Blame - Include/unicodeobject.h - platform/external/python/cpython2

blob: f91a5a0c8c7f35e0b7fbd8bb3b83e437bd89084e [file] [log] [blame]

Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	1	#ifndef Py_UNICODEOBJECT_H
				2	#define Py_UNICODEOBJECT_H
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	3
				4	/*
				5
				6	Unicode implementation based on original code by Fredrik Lundh,
				7	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				8	Unicode Integration Proposal (see file Misc/unicode.txt).
				9
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	10	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	11
				12
				13	Original header:
				14	--------------------------------------------------------------------
				15
				16	* Yet another Unicode string type for Python. This type supports the
				17	* 16-bit Basic Multilingual Plane (BMP) only.
				18	*
				19	* Written by Fredrik Lundh, January 1999.
				20	*
				21	* Copyright (c) 1999 by Secret Labs AB.
				22	* Copyright (c) 1999 by Fredrik Lundh.
				23	*
				24	* fredrik@pythonware.com
				25	* http://www.pythonware.com
				26	*
				27	* --------------------------------------------------------------------
				28	* This Unicode String Type is
				29	*
				30	* Copyright (c) 1999 by Secret Labs AB
				31	* Copyright (c) 1999 by Fredrik Lundh
				32	*
				33	* By obtaining, using, and/or copying this software and/or its
				34	* associated documentation, you agree that you have read, understood,
				35	* and will comply with the following terms and conditions:
				36	*
				37	* Permission to use, copy, modify, and distribute this software and its
				38	* associated documentation for any purpose and without fee is hereby
				39	* granted, provided that the above copyright notice appears in all
				40	* copies, and that both that copyright notice and this permission notice
				41	* appear in supporting documentation, and that the name of Secret Labs
				42	* AB or the author not be used in advertising or publicity pertaining to
				43	* distribution of the software without specific, written prior
				44	* permission.
				45	*
				46	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				47	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				48	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				49	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				50	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				51	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				52	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				53	* -------------------------------------------------------------------- */
				54
				55	#include "ctype.h"
				56
				57	/* === Internal API ======================================================= */
				58
				59	/* --- Internal Unicode Format -------------------------------------------- */
				60
				61	/* Set these flags if the platform has "wchar.h", "wctype.h" and the
				62	wchar_t type is a 16-bit unsigned type */
				63	/* #define HAVE_WCHAR_H */
				64	/* #define HAVE_USABLE_WCHAR_T */
				65
				66	/* Defaults for various platforms */
				67	#ifndef HAVE_USABLE_WCHAR_T
				68
				69	/* Windows has a usable wchar_t type */
				70	# if defined(MS_WIN32)
				71	# define HAVE_USABLE_WCHAR_T
				72	# endif
				73
				74	#endif
				75
				76	/* If the compiler provides a wchar_t type we try to support it
				77	through the interface functions PyUnicode_FromWideChar() and
				78	PyUnicode_AsWideChar(). */
				79
				80	#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg	1a731c6	2000-08-11 11:43:10 +0000	[diff] [blame]	81	# ifndef HAVE_WCHAR_H
				82	# define HAVE_WCHAR_H
				83	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	84	#endif
				85
				86	#ifdef HAVE_WCHAR_H
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	87	/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
				88	# ifdef _HAVE_BSDI
				89	# include <time.h>
				90	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	91	# include "wchar.h"
				92	#endif
				93
				94	#ifdef HAVE_USABLE_WCHAR_T
				95
				96	/* If the compiler defines whcar_t as a 16-bit unsigned type we can
				97	use the compiler type directly. Works fine with all modern Windows
				98	platforms. */
				99
				100	typedef wchar_t Py_UNICODE;
				101
				102	#else
				103
				104	/* Use if you have a standard ANSI compiler, without wchar_t support.
				105	If a short is not 16 bits on your platform, you have to fix the
				106	typedef below, or the module initialization code will complain. */
				107
				108	typedef unsigned short Py_UNICODE;
				109
				110	#endif
				111
Marc-André Lemburg	4327910	2000-07-07 09:01:41 +0000	[diff] [blame]	112	/*
				113	* Use this typedef when you need to represent a UTF-16 surrogate pair
				114	* as single unsigned integer.
				115	*/
				116	#if SIZEOF_INT >= 4
				117	typedef unsigned int Py_UCS4;
				118	#elif SIZEOF_LONG >= 4
				119	typedef unsigned long Py_UCS4;
				120	#endif
				121
				122
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	123	/* --- Internal Unicode Operations ---------------------------------------- */
				124
				125	/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	126	of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
				127	configure Python using --with-ctype-functions. This reduces the
				128	interpreter's code size. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	129
				130	#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
				131
				132	#include "wctype.h"
				133
				134	#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
				135
				136	#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
				137	#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
				138	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				139	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				140
				141	#define Py_UNICODE_TOLOWER(ch) towlower(ch)
				142	#define Py_UNICODE_TOUPPER(ch) towupper(ch)
				143	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				144
				145	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				146	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				147	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				148
				149	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				150	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				151	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				152
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	153	#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
				154
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	155	#else
				156
				157	#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
				158
				159	#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
				160	#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
				161	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				162	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				163
				164	#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
				165	#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
				166	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				167
				168	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				169	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				170	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				171
				172	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				173	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				174	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				175
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	176	#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	177
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	178	#endif
Marc-André Lemburg	a9c103b	2000-07-03 10:52:13 +0000	[diff] [blame]	179
				180	#define Py_UNICODE_ISALNUM(ch) \
				181	(Py_UNICODE_ISALPHA(ch) \|\| \
				182	Py_UNICODE_ISDECIMAL(ch) \|\| \
				183	Py_UNICODE_ISDIGIT(ch) \|\| \
				184	Py_UNICODE_ISNUMERIC(ch))
				185
/* Copy length Py_UNICODE elements from source to target. This is a
   plain memcpy(), so the two buffers must not overlap. */
#define Py_UNICODE_COPY(target, source, length)\
    (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))

/* Store value into the first length elements of target.

   target and value are evaluated exactly once. The loop counter has a
   trailing underscore so that it cannot capture a caller variable used
   in the length expression (a plain "i" would make
   Py_UNICODE_FILL(buf, ch, i) compare the counter with itself and
   fill nothing). */
#define Py_UNICODE_FILL(target, value, length) do\
    {int i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;}\
    while (0)

/* True if substring occurs in string at position offset.

   The first characters are compared directly as a cheap pre-check
   before calling memcmp(). No bounds checking is done here: the
   caller must make sure that offset + substring->length does not
   exceed the length of string, and that substring is not empty
   (its first character is read unconditionally). */
#define Py_UNICODE_MATCH(string, offset, substring)\
    ((*((string)->str + (offset)) == *((substring)->str)) &&\
     !memcmp((string)->str + (offset), (substring)->str,\
             (substring)->length*sizeof(Py_UNICODE)))
				197
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	198	#ifdef __cplusplus
				199	extern "C" {
				200	#endif
				201
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	202	/* --- Unicode Type ------------------------------------------------------- */
				203
				204	typedef struct {
				205	PyObject_HEAD
				206	int length; /* Length of raw Unicode data in buffer */
				207	Py_UNICODE str; / Raw Unicode buffer */
				208	long hash; /* Hash value; -1 if not set */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	209	PyObject defenc; / (Default) Encoded version as Python
				210	string, or NULL; this is used for
				211	implementing the buffer protocol */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	212	} PyUnicodeObject;
				213
				214	extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
				215
				216	#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
				217
				218	/* Fast access macros */
				219	#define PyUnicode_GET_SIZE(op) \
				220	(((PyUnicodeObject *)(op))->length)
				221	#define PyUnicode_GET_DATA_SIZE(op) \
				222	(((PyUnicodeObject )(op))->length sizeof(Py_UNICODE))
				223	#define PyUnicode_AS_UNICODE(op) \
				224	(((PyUnicodeObject *)(op))->str)
				225	#define PyUnicode_AS_DATA(op) \
				226	((const char )((PyUnicodeObject )(op))->str)
				227
				228	/* --- Constants ---------------------------------------------------------- */
				229
				230	/* This Unicode character will be used as replacement character during
				231	decoding if the errors argument is set to "replace". Note: the
				232	Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
				233	Unicode 3.0. */
				234
				235	#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
				236
				237	/* === Public API ========================================================= */
				238
				239	/* --- Plain Py_UNICODE --------------------------------------------------- */
				240
				241	/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	242	size.
				243
				244	u may be NULL which causes the contents to be undefined. It is the
				245	user's responsibility to fill in the needed data afterwards. Note
				246	that modifying the Unicode object contents after construction is
				247	only allowed if u was set to NULL.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	248
				249	The buffer is copied into the new object. */
				250
				251	extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
				252	const Py_UNICODE u, / Unicode buffer */
				253	int size /* size of buffer */
				254	);
				255
				256	/* Return a read-only pointer to the Unicode object's internal
				257	Py_UNICODE buffer. */
				258
				259	extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
				260	PyObject unicode / Unicode object */
				261	);
				262
				263	/* Get the length of the Unicode object. */
				264
				265	extern DL_IMPORT(int) PyUnicode_GetSize(
				266	PyObject unicode / Unicode object */
				267	);
				268
Guido van Rossum	52c2359	2000-04-10 13:41:41 +0000	[diff] [blame]	269	/* Resize an already allocated Unicode object to the new size length.
				270
				271	*unicode is modified to point to the new (resized) object and 0
				272	returned on success.
				273
				274	This API may only be called by the function which also called the
				275	Unicode constructor. The refcount on the object must be 1. Otherwise,
				276	an error is returned.
				277
				278	Error handling is implemented as follows: an exception is set, -1
				279	is returned and *unicode left untouched.
				280
				281	*/
				282
				283	extern DL_IMPORT(int) PyUnicode_Resize(
				284	PyObject *unicode, / Pointer to the Unicode object */
				285	int length /* New length */
				286	);
				287
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	288	/* Coerce obj to an Unicode object and return a reference with
				289	incremented refcount.
				290
				291	Coercion is done in the following way:
				292
				293	1. Unicode objects are passed back as-is with incremented
				294	refcount.
				295
				296	2. String and other char buffer compatible objects are decoded
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	297	under the assumptions that they contain data using the current
				298	default encoding. Decoding is done in "strict" mode.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	299
				300	3. All other objects raise an exception.
				301
				302	The API returns NULL in case of an error. The caller is responsible
				303	for decref'ing the returned objects.
				304
				305	*/
				306
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	307	extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
				308	register PyObject obj, / Object */
				309	const char encoding, / encoding */
				310	const char errors / error handling */
				311	);
				312
				313	/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
				314	which results in using the default encoding as basis for
				315	decoding the object.
				316
				317	Coerces obj to an Unicode object and return a reference with
				318	incremented refcount.
				319
				320	The API returns NULL in case of an error. The caller is responsible
				321	for decref'ing the returned objects.
				322
				323	*/
				324
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	325	extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
				326	register PyObject obj / Object */
				327	);
				328
				329	/* --- wchar_t support for platforms which support it --------------------- */
				330
				331	#ifdef HAVE_WCHAR_H
				332
				333	/* Create a Unicode Object from the whcar_t buffer w of the given
				334	size.
				335
				336	The buffer is copied into the new object. */
				337
				338	extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
				339	register const wchar_t w, / wchar_t buffer */
				340	int size /* size of buffer */
				341	);
				342
				343	/* Copies the Unicode Object contents into the whcar_t buffer w. At
				344	most size wchar_t characters are copied.
				345
				346	Returns the number of wchar_t characters copied or -1 in case of an
				347	error. */
				348
				349	extern DL_IMPORT(int) PyUnicode_AsWideChar(
				350	PyUnicodeObject unicode, / Unicode object */
				351	register wchar_t w, / wchar_t buffer */
				352	int size /* size of buffer */
				353	);
				354
				355	#endif
				356
				357	/* === Builtin Codecs =====================================================
				358
				359	Many of these APIs take two arguments encoding and errors. These
				360	parameters encoding and errors have the same semantics as the ones
				361	of the builtin unicode() API.
				362
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	363	Setting encoding to NULL causes the default encoding to be used.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	364
				365	Error handling is set by errors which may also be set to NULL
				366	meaning to use the default handling defined for the codec. Default
				367	error handling for all builtin codecs is "strict" (ValueErrors are
				368	raised).
				369
				370	The codecs all use a similar interface. Only deviation from the
				371	generic ones are documented.
				372
				373	*/
				374
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	375	/* --- Manage the default encoding ---------------------------------------- */
				376
				377	/* Returns the currently active default encoding.
				378
				379	The default encoding is currently implemented as run-time settable
				380	process global. This may change in future versions of the
				381	interpreter to become a parameter which is managed on a per-thread
				382	basis.
				383
				384	*/
				385
Thomas Wouters	5f37591	2000-07-22 23:30:03 +0000	[diff] [blame]	386	extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	387
				388	/* Sets the currently active default encoding.
				389
				390	Returns 0 on success, -1 in case of an error.
				391
				392	*/
				393
				394	extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
				395	const char encoding / Encoding name in standard form */
				396	);
				397
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	398	/* --- Generic Codecs ----------------------------------------------------- */
				399
				400	/* Create a Unicode object by decoding the encoded string s of the
				401	given size. */
				402
				403	extern DL_IMPORT(PyObject*) PyUnicode_Decode(
				404	const char s, / encoded string */
				405	int size, /* size of buffer */
				406	const char encoding, / encoding */
				407	const char errors / error handling */
				408	);
				409
				410	/* Encodes a Py_UNICODE buffer of the given size and returns a
				411	Python string object. */
				412
				413	extern DL_IMPORT(PyObject*) PyUnicode_Encode(
				414	const Py_UNICODE s, / Unicode char buffer */
				415	int size, /* number of Py_UNICODE chars to encode */
				416	const char encoding, / encoding */
				417	const char errors / error handling */
				418	);
				419
				420	/* Encodes a Unicode object and returns the result as Python string
				421	object. */
				422
				423	extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
				424	PyObject unicode, / Unicode object */
				425	const char encoding, / encoding */
				426	const char errors / error handling */
				427	);
				428
				429	/* --- UTF-8 Codecs ------------------------------------------------------- */
				430
				431	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
				432	const char string, / UTF-8 encoded string */
				433	int length, /* size of string */
				434	const char errors / error handling */
				435	);
				436
				437	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
				438	PyObject unicode / Unicode object */
				439	);
				440
				441	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
				442	const Py_UNICODE data, / Unicode char buffer */
				443	int length, /* number of Py_UNICODE chars to encode */
				444	const char errors / error handling */
				445	);
				446
				447	/* --- UTF-16 Codecs ------------------------------------------------------ */
				448
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	450	the corresponding Unicode object.
				451
				452	errors (if non-NULL) defines the error handling. It defaults
				453	to "strict".
				454
				455	If byteorder is non-NULL, the decoder starts decoding using the
				456	given byte order:
				457
				458	*byteorder == -1: little endian
				459	*byteorder == 0: native order
				460	*byteorder == 1: big endian
				461
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	462	In native mode, the first two bytes of the stream are checked for a
				463	BOM mark. If found, the BOM mark is analysed, the byte order
				464	adjusted and the BOM skipped. In the other modes, no BOM mark
				465	interpretation is done. After completion, *byteorder is set to the
				466	current byte order at the end of input data.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	467
				468	If byteorder is NULL, the codec starts in native order mode.
				469
				470	*/
				471
				472	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
				473	const char string, / UTF-16 encoded string */
				474	int length, /* size of string */
				475	const char errors, / error handling */
				476	int byteorder / pointer to byteorder to use
				477	0=native;-1=LE,1=BE; updated on
				478	exit */
				479	);
				480
				481	/* Returns a Python string using the UTF-16 encoding in native byte
				482	order. The string always starts with a BOM mark. */
				483
				484	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
				485	PyObject unicode / Unicode object */
				486	);
				487
				488	/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	489	the Unicode data.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	490
				491	If byteorder is not 0, output is written according to the following
				492	byte order:
				493
				494	byteorder == -1: little endian
				495	byteorder == 0: native byte order (writes a BOM mark)
				496	byteorder == 1: big endian
				497
				498	If byteorder is 0, the output string will always start with the
				499	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
				500	prepended.
				501
				502	Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
				503	UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	504	at a later point without compromising the APIs.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	505
				506	*/
				507
				508	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
				509	const Py_UNICODE data, / Unicode char buffer */
				510	int length, /* number of Py_UNICODE chars to encode */
				511	const char errors, / error handling */
				512	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
				513	);
				514
				515	/* --- Unicode-Escape Codecs ---------------------------------------------- */
				516
				517	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
				518	const char string, / Unicode-Escape encoded string */
				519	int length, /* size of string */
				520	const char errors / error handling */
				521	);
				522
				523	extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
				524	PyObject unicode / Unicode object */
				525	);
				526
				527	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
				528	const Py_UNICODE data, / Unicode char buffer */
				529	int length /* Number of Py_UNICODE chars to encode */
				530	);
				531
				532	/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
				533
				534	extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
				535	const char string, / Raw-Unicode-Escape encoded string */
				536	int length, /* size of string */
				537	const char errors / error handling */
				538	);
				539
				540	extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
				541	PyObject unicode / Unicode object */
				542	);
				543
				544	extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
				545	const Py_UNICODE data, / Unicode char buffer */
				546	int length /* Number of Py_UNICODE chars to encode */
				547	);
				548
				549	/* --- Latin-1 Codecs -----------------------------------------------------
				550
				551	Note: Latin-1 corresponds to the first 256 Unicode ordinals.
				552
				553	*/
				554
				555	extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
				556	const char string, / Latin-1 encoded string */
				557	int length, /* size of string */
				558	const char errors / error handling */
				559	);
				560
				561	extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
				562	PyObject unicode / Unicode object */
				563	);
				564
				565	extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
				566	const Py_UNICODE data, / Unicode char buffer */
				567	int length, /* Number of Py_UNICODE chars to encode */
				568	const char errors / error handling */
				569	);
				570
				571	/* --- ASCII Codecs -------------------------------------------------------
				572
				573	Only 7-bit ASCII data is excepted. All other codes generate errors.
				574
				575	*/
				576
				577	extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
				578	const char string, / ASCII encoded string */
				579	int length, /* size of string */
				580	const char errors / error handling */
				581	);
				582
				583	extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
				584	PyObject unicode / Unicode object */
				585	);
				586
				587	extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
				588	const Py_UNICODE data, / Unicode char buffer */
				589	int length, /* Number of Py_UNICODE chars to encode */
				590	const char errors / error handling */
				591	);
				592
				593	/* --- Character Map Codecs -----------------------------------------------
				594
				595	This codec uses mappings to encode and decode characters.
				596
				597	Decoding mappings must map single string characters to single
				598	Unicode characters, integers (which are then interpreted as Unicode
				599	ordinals) or None (meaning "undefined mapping" and causing an
				600	error).
				601
				602	Encoding mappings must map single Unicode characters to single
				603	string characters, integers (which are then interpreted as Latin-1
				604	ordinals) or None (meaning "undefined mapping" and causing an
				605	error).
				606
				607	If a character lookup fails with a LookupError, the character is
				608	copied as-is meaning that its ordinal value will be interpreted as
				609	Unicode or Latin-1 ordinal resp. Because of this mappings only need
				610	to contain those mappings which map characters to different code
				611	points.
				612
				613	*/
				614
				615	extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
				616	const char string, / Encoded string */
				617	int length, /* size of string */
				618	PyObject mapping, / character mapping
				619	(char ordinal -> unicode ordinal) */
				620	const char errors / error handling */
				621	);
				622
				623	extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
				624	PyObject unicode, / Unicode object */
				625	PyObject mapping / character mapping
				626	(unicode ordinal -> char ordinal) */
				627	);
				628
				629	extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
				630	const Py_UNICODE data, / Unicode char buffer */
				631	int length, /* Number of Py_UNICODE chars to encode */
				632	PyObject mapping, / character mapping
				633	(unicode ordinal -> char ordinal) */
				634	const char errors / error handling */
				635	);
				636
				637	/* Translate a Py_UNICODE buffer of the given length by applying a
				638	character mapping table to it and return the resulting Unicode
				639	object.
				640
				641	The mapping table must map Unicode ordinal integers to Unicode
				642	ordinal integers or None (causing deletion of the character).
				643
				644	Mapping tables may be dictionaries or sequences. Unmapped character
				645	ordinals (ones which cause a LookupError) are left untouched and
				646	are copied as-is.
				647
				648	*/
				649
				650	extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
				651	const Py_UNICODE data, / Unicode char buffer */
				652	int length, /* Number of Py_UNICODE chars to encode */
				653	PyObject table, / Translate table */
				654	const char errors / error handling */
				655	);
				656
#ifdef MS_WIN32

/* --- MBCS codecs for Windows -------------------------------------------- */

extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
    int length,                 /* size of string */
    const char *errors          /* error handling */
    );

extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
    const Py_UNICODE *data,     /* Unicode char buffer */
    int length,                 /* Number of Py_UNICODE chars to encode */
    const char *errors          /* error handling */
    );

#endif /* MS_WIN32 */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	678
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	679	/* --- Decimal Encoder ---------------------------------------------------- */
				680
				681	/* Takes a Unicode string holding a decimal value and writes it into
				682	an output buffer using standard ASCII digit codes.
				683
				684	The output buffer has to provide at least length+1 bytes of storage
				685	area. The output string is 0-terminated.
				686
				687	The encoder converts whitespace to ' ', decimal characters to their
				688	corresponding ASCII digit and all other Latin-1 characters except
				689	\0 as-is. Characters outside this range (Unicode ordinals 1-256)
				690	are treated as errors. This includes embedded NULL bytes.
				691
				692	Error handling is defined by the errors argument:
				693
				694	NULL or "strict": raise a ValueError
				695	"ignore": ignore the wrong characters (these are not copied to the
				696	output buffer)
				697	"replace": replaces illegal characters with '?'
				698
				699	Returns 0 on success, -1 on failure.
				700
				701	*/
				702
				703	extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
				704	Py_UNICODE s, / Unicode buffer */
				705	int length, /* Number of Py_UNICODE chars to encode */
				706	char output, / Output buffer; must have size >= length */
				707	const char errors / error handling */
				708	);
				709
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	710	/* --- Methods & Slots ----------------------------------------------------
				711
				712	These are capable of handling Unicode objects and strings on input
				713	(we refer to them as strings in the descriptions) and return
				714	Unicode objects or integers as apporpriate. */
				715
				716	/* Concat two strings giving a new Unicode string. */
				717
				718	extern DL_IMPORT(PyObject*) PyUnicode_Concat(
				719	PyObject left, / Left string */
				720	PyObject right / Right string */
				721	);
				722
				723	/* Split a string giving a list of Unicode strings.
				724
				725	If sep is NULL, splitting will be done at all whitespace
				726	substrings. Otherwise, splits occur at the given separator.
				727
				728	At most maxsplit splits will be done. If negative, no limit is set.
				729
				730	Separators are not included in the resulting list.
				731
				732	*/
				733
				734	extern DL_IMPORT(PyObject*) PyUnicode_Split(
				735	PyObject s, / String to split */
				736	PyObject sep, / String separator */
				737	int maxsplit /* Maxsplit count */
				738	);
				739
				740	/* Ditto, but split at line breaks.
				741
				742	CRLF is considered to be one line break. Line breaks are not
				743	included in the resulting list. */
				744
				745	extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
				746	PyObject *s, /* String to split */
Guido van Rossum	004d64f	2000-04-11 15:39:46 +0000	[diff] [blame]	747	int keepends /* If true, line end markers are included */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	748	);
				749
				750	/* Translate a string by applying a character mapping table to it and
				751	return the resulting Unicode object.
				752
				753	The mapping table must map Unicode ordinal integers to Unicode
				754	ordinal integers or None (causing deletion of the character).
				755
				756	Mapping tables may be dictionaries or sequences. Unmapped character
				757	ordinals (ones which cause a LookupError) are left untouched and
				758	are copied as-is.
				759
				760	*/
				761
				762	extern DL_IMPORT(PyObject *) PyUnicode_Translate(
				763	PyObject *str, /* String */
				764	PyObject *table, /* Translate table */
				765	const char *errors /* error handling */
				766	);
				767
				768	/* Join a sequence of strings using the given separator and return
				769	the resulting Unicode string. */
				770
				771	extern DL_IMPORT(PyObject*) PyUnicode_Join(
				772	PyObject *separator, /* Separator string */
				773	PyObject *seq /* Sequence object */
				774	);
				775
				776	/* Return 1 if substr matches str[start:end] at the given tail end, 0
				777	otherwise. */
				778
				779	extern DL_IMPORT(int) PyUnicode_Tailmatch(
				780	PyObject *str, /* String */
				781	PyObject *substr, /* Prefix or Suffix string */
				782	int start, /* Start index */
				783	int end, /* Stop index */
				784	int direction /* Tail end: -1 prefix, +1 suffix */
				785	);
				786
				787	/* Return the first position of substr in str[start:end] using the
				788	given search direction or -1 if not found. */
				789
				790	extern DL_IMPORT(int) PyUnicode_Find(
				791	PyObject *str, /* String */
				792	PyObject *substr, /* Substring to find */
				793	int start, /* Start index */
				794	int end, /* Stop index */
				795	int direction /* Find direction: +1 forward, -1 backward */
				796	);
				797
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	798	/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	799
				800	extern DL_IMPORT(int) PyUnicode_Count(
				801	PyObject *str, /* String */
				802	PyObject *substr, /* Substring to count */
				803	int start, /* Start index */
				804	int end /* Stop index */
				805	);
				806
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	807	/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	808	and return the resulting Unicode object. */
				809
				810	extern DL_IMPORT(PyObject *) PyUnicode_Replace(
				811	PyObject *str, /* String */
				812	PyObject *substr, /* Substring to find */
				813	PyObject *replstr, /* Substring to replace */
				814	int maxcount /* Max. number of replacements to apply;
				815	-1 = all */
				816	);
				817
				818	/* Compare two strings and return -1, 0, 1 for less than, equal,
				819	greater than resp. */
				820
				821	extern DL_IMPORT(int) PyUnicode_Compare(
				822	PyObject *left, /* Left string */
				823	PyObject *right /* Right string */
				824	);
				825
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	826	/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	827	the resulting Unicode string. */
				828
				829	extern DL_IMPORT(PyObject *) PyUnicode_Format(
				830	PyObject *format, /* Format string */
				831	PyObject *args /* Argument tuple or dictionary */
				832	);
				833
Guido van Rossum	d0d366b	2000-03-13 23:22:24 +0000	[diff] [blame]	834	/* Checks whether element is contained in container and return 1/0
				835	accordingly.
				836
				837	element has to coerce to a one-element Unicode string. -1 is
				838	returned in case of an error. */
				839
				840	extern DL_IMPORT(int) PyUnicode_Contains(
				841	PyObject *container, /* Container string */
				842	PyObject *element /* Element string */
				843	);
				844
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	845	/* === Characters Type APIs =============================================== */
				846
				847	/* These should not be used directly. Use the Py_UNICODE_IS* and
				848	Py_UNICODE_TO* macros instead.
				849
				850	These APIs are implemented in Objects/unicodectype.c.
				851
				852	*/
				853
				854	extern DL_IMPORT(int) _PyUnicode_IsLowercase(
				855	register const Py_UNICODE ch /* Unicode character */
				856	);
				857
				858	extern DL_IMPORT(int) _PyUnicode_IsUppercase(
				859	register const Py_UNICODE ch /* Unicode character */
				860	);
				861
				862	extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
				863	register const Py_UNICODE ch /* Unicode character */
				864	);
				865
				866	extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
				867	register const Py_UNICODE ch /* Unicode character */
				868	);
				869
				870	extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
				871	register const Py_UNICODE ch /* Unicode character */
				872	);
				873
				874	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
				875	register const Py_UNICODE ch /* Unicode character */
				876	);
				877
				878	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
				879	register const Py_UNICODE ch /* Unicode character */
				880	);
				881
				882	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
				883	register const Py_UNICODE ch /* Unicode character */
				884	);
				885
				886	extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
				887	register const Py_UNICODE ch /* Unicode character */
				888	);
				889
				890	extern DL_IMPORT(int) _PyUnicode_ToDigit(
				891	register const Py_UNICODE ch /* Unicode character */
				892	);
				893
				894	extern DL_IMPORT(double) _PyUnicode_ToNumeric(
				895	register const Py_UNICODE ch /* Unicode character */
				896	);
				897
				898	extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
				899	register const Py_UNICODE ch /* Unicode character */
				900	);
				901
				902	extern DL_IMPORT(int) _PyUnicode_IsDigit(
				903	register const Py_UNICODE ch /* Unicode character */
				904	);
				905
				906	extern DL_IMPORT(int) _PyUnicode_IsNumeric(
				907	register const Py_UNICODE ch /* Unicode character */
				908	);
				909
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	910	extern DL_IMPORT(int) _PyUnicode_IsAlpha(
				911	register const Py_UNICODE ch /* Unicode character */
				912	);
				913
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	914	#ifdef __cplusplus
				915	}
				916	#endif
				917	#endif /* !Py_UNICODEOBJECT_H */