Blame - Include/unicodeobject.h - platform/external/python/cpython3

blob: 988ea1b39ec8af35b826d40d685c1ad313c6452c [file] [log] [blame]

Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	1	#ifndef Py_UNICODEOBJECT_H
				2	#define Py_UNICODEOBJECT_H
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	3
				4	/*
				5
				6	Unicode implementation based on original code by Fredrik Lundh,
				7	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				8	Unicode Integration Proposal (see file Misc/unicode.txt).
				9
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	10	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	11
				12
				13	Original header:
				14	--------------------------------------------------------------------
				15
				16	* Yet another Unicode string type for Python. This type supports the
				17	* 16-bit Basic Multilingual Plane (BMP) only.
				18	*
				19	* Written by Fredrik Lundh, January 1999.
				20	*
				21	* Copyright (c) 1999 by Secret Labs AB.
				22	* Copyright (c) 1999 by Fredrik Lundh.
				23	*
				24	* fredrik@pythonware.com
				25	* http://www.pythonware.com
				26	*
				27	* --------------------------------------------------------------------
				28	* This Unicode String Type is
				29	*
				30	* Copyright (c) 1999 by Secret Labs AB
				31	* Copyright (c) 1999 by Fredrik Lundh
				32	*
				33	* By obtaining, using, and/or copying this software and/or its
				34	* associated documentation, you agree that you have read, understood,
				35	* and will comply with the following terms and conditions:
				36	*
				37	* Permission to use, copy, modify, and distribute this software and its
				38	* associated documentation for any purpose and without fee is hereby
				39	* granted, provided that the above copyright notice appears in all
				40	* copies, and that both that copyright notice and this permission notice
				41	* appear in supporting documentation, and that the name of Secret Labs
				42	* AB or the author not be used in advertising or publicity pertaining to
				43	* distribution of the software without specific, written prior
				44	* permission.
				45	*
				46	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				47	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				48	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				49	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				50	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				51	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				52	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				53	* -------------------------------------------------------------------- */
				54
				55	#include "ctype.h"
				56
				57	/* === Internal API ======================================================= */
				58
				59	/* --- Internal Unicode Format -------------------------------------------- */
				60
				61	/* Set these flags if the platform has "wchar.h", "wctype.h" and the
				62	wchar_t type is a 16-bit unsigned type */
				63	/* #define HAVE_WCHAR_H */
				64	/* #define HAVE_USABLE_WCHAR_T */
				65
				66	/* Defaults for various platforms */
				67	#ifndef HAVE_USABLE_WCHAR_T
				68
				69	/* Windows has a usable wchar_t type */
				70	# if defined(MS_WIN32)
				71	# define HAVE_USABLE_WCHAR_T
				72	# endif
				73
				74	#endif
				75
				76	/* If the compiler provides a wchar_t type we try to support it
				77	through the interface functions PyUnicode_FromWideChar() and
				78	PyUnicode_AsWideChar(). */
				79
				80	#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg	1a731c6	2000-08-11 11:43:10 +0000	[diff] [blame]	81	# ifndef HAVE_WCHAR_H
				82	# define HAVE_WCHAR_H
				83	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	84	#endif
				85
				86	#ifdef HAVE_WCHAR_H
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	87	/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
				88	# ifdef _HAVE_BSDI
				89	# include <time.h>
				90	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	91	# include "wchar.h"
				92	#endif
				93
				94	#ifdef HAVE_USABLE_WCHAR_T
				95
				96	/* If the compiler defines wchar_t as a 16-bit unsigned type we can
				97	use the compiler type directly. Works fine with all modern Windows
				98	platforms. */
				99
				100	typedef wchar_t Py_UNICODE;
				101
				102	#else
				103
				104	/* Use if you have a standard ANSI compiler, without wchar_t support.
				105	If a short is not 16 bits on your platform, you have to fix the
				106	typedef below, or the module initialization code will complain. */
				107
				108	typedef unsigned short Py_UNICODE;
				109
				110	#endif
				111
Marc-André Lemburg	4327910	2000-07-07 09:01:41 +0000	[diff] [blame]	112	/*
				113	* Use this typedef when you need to represent a UTF-16 surrogate pair
				114	* as single unsigned integer.
				115	*/
				116	#if SIZEOF_INT >= 4
				117	typedef unsigned int Py_UCS4;
				118	#elif SIZEOF_LONG >= 4
				119	typedef unsigned long Py_UCS4;
				120	#endif
				121
				122
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	123	/* --- Internal Unicode Operations ---------------------------------------- */
				124
				125	/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	126	of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
				127	configure Python using --with-ctype-functions. This reduces the
				128	interpreter's code size. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	129
				130	#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
				131
				132	#include "wctype.h"
				133
				134	#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
				135
				136	#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
				137	#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
				138	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				139	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				140
				141	#define Py_UNICODE_TOLOWER(ch) towlower(ch)
				142	#define Py_UNICODE_TOUPPER(ch) towupper(ch)
				143	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				144
				145	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				146	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				147	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				148
				149	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				150	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				151	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				152
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	153	#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
				154
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	155	#else
				156
				157	#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
				158
				159	#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
				160	#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
				161	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				162	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				163
				164	#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
				165	#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
				166	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				167
				168	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				169	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				170	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				171
				172	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				173	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				174	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				175
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	176	#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	177
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	178	#endif
Marc-André Lemburg	a9c103b	2000-07-03 10:52:13 +0000	[diff] [blame]	179
				180	#define Py_UNICODE_ISALNUM(ch) \
				181	(Py_UNICODE_ISALPHA(ch) || \
				182	Py_UNICODE_ISDECIMAL(ch) || \
				183	Py_UNICODE_ISDIGIT(ch) || \
				184	Py_UNICODE_ISNUMERIC(ch))
				185
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	186	#define Py_UNICODE_COPY(target, source, length)\
				187	(memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
				188
				189	#define Py_UNICODE_FILL(target, value, length) do\
				190	{int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\
				191	while (0)
				192
				193	#define Py_UNICODE_MATCH(string, offset, substring)\
Marc-André Lemburg	2f4d0e9	2000-06-18 22:22:27 +0000	[diff] [blame]	194	((*((string)->str + (offset)) == *((substring)->str)) &&\
				195	!memcmp((string)->str + (offset), (substring)->str,\
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	196	(substring)->length*sizeof(Py_UNICODE)))
				197
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	198	#ifdef __cplusplus
				199	extern "C" {
				200	#endif
				201
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	202	/* --- Unicode Type ------------------------------------------------------- */
				203
				204	typedef struct {
				205	PyObject_HEAD
				206	int length; /* Length of raw Unicode data in buffer */
				207	Py_UNICODE *str; /* Raw Unicode buffer */
				208	long hash; /* Hash value; -1 if not set */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	209	PyObject *defenc; /* (Default) Encoded version as Python
				210	string, or NULL; this is used for
				211	implementing the buffer protocol */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	212	} PyUnicodeObject;
				213
				214	extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
				215
				216	#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
				217
				218	/* Fast access macros */
				219	#define PyUnicode_GET_SIZE(op) \
				220	(((PyUnicodeObject *)(op))->length)
				221	#define PyUnicode_GET_DATA_SIZE(op) \
				222	(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
				223	#define PyUnicode_AS_UNICODE(op) \
				224	(((PyUnicodeObject *)(op))->str)
				225	#define PyUnicode_AS_DATA(op) \
				226	((const char *)((PyUnicodeObject *)(op))->str)
				227
				228	/* --- Constants ---------------------------------------------------------- */
				229
				230	/* This Unicode character will be used as replacement character during
				231	decoding if the errors argument is set to "replace". Note: the
				232	Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
				233	Unicode 3.0. */
				234
				235	#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
				236
				237	/* === Public API ========================================================= */
				238
				239	/* --- Plain Py_UNICODE --------------------------------------------------- */
				240
				241	/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	242	size.
				243
				244	u may be NULL which causes the contents to be undefined. It is the
				245	user's responsibility to fill in the needed data afterwards. Note
				246	that modifying the Unicode object contents after construction is
				247	only allowed if u was set to NULL.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	248
				249	The buffer is copied into the new object. */
				250
				251	extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
				252	const Py_UNICODE *u, /* Unicode buffer */
				253	int size /* size of buffer */
				254	);
				255
				256	/* Return a read-only pointer to the Unicode object's internal
				257	Py_UNICODE buffer. */
				258
				259	extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
				260	PyObject *unicode /* Unicode object */
				261	);
				262
				263	/* Get the length of the Unicode object. */
				264
				265	extern DL_IMPORT(int) PyUnicode_GetSize(
				266	PyObject *unicode /* Unicode object */
				267	);
				268
Guido van Rossum	52c2359	2000-04-10 13:41:41 +0000	[diff] [blame]	269	/* Resize an already allocated Unicode object to the new size length.
				270
				271	*unicode is modified to point to the new (resized) object and 0
				272	returned on success.
				273
				274	This API may only be called by the function which also called the
				275	Unicode constructor. The refcount on the object must be 1. Otherwise,
				276	an error is returned.
				277
				278	Error handling is implemented as follows: an exception is set, -1
				279	is returned and *unicode left untouched.
				280
				281	*/
				282
				283	extern DL_IMPORT(int) PyUnicode_Resize(
				284	PyObject **unicode, /* Pointer to the Unicode object */
				285	int length /* New length */
				286	);
				287
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	288	/* Coerce obj to an Unicode object and return a reference with
				289	incremented refcount.
				290
				291	Coercion is done in the following way:
				292
				293	1. Unicode objects are passed back as-is with incremented
				294	refcount.
				295
				296	2. String and other char buffer compatible objects are decoded
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	297	under the assumptions that they contain data using the current
				298	default encoding. Decoding is done in "strict" mode.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	299
				300	3. All other objects raise an exception.
				301
				302	The API returns NULL in case of an error. The caller is responsible
				303	for decref'ing the returned objects.
				304
				305	*/
				306
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	307	extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
				308	register PyObject *obj, /* Object */
				309	const char *encoding, /* encoding */
				310	const char *errors /* error handling */
				311	);
				312
				313	/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
				314	which results in using the default encoding as basis for
				315	decoding the object.
				316
				317	Coerces obj to an Unicode object and return a reference with
				318	incremented refcount.
				319
				320	The API returns NULL in case of an error. The caller is responsible
				321	for decref'ing the returned objects.
				322
				323	*/
				324
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	325	extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
				326	register PyObject *obj /* Object */
				327	);
				328
				329	/* --- wchar_t support for platforms which support it --------------------- */
				330
				331	#ifdef HAVE_WCHAR_H
				332
				333	/* Create a Unicode Object from the wchar_t buffer w of the given
				334	size.
				335
				336	The buffer is copied into the new object. */
				337
				338	extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
				339	register const wchar_t *w, /* wchar_t buffer */
				340	int size /* size of buffer */
				341	);
				342
				343	/* Copies the Unicode Object contents into the wchar_t buffer w. At
				344	most size wchar_t characters are copied.
				345
				346	Returns the number of wchar_t characters copied or -1 in case of an
				347	error. */
				348
				349	extern DL_IMPORT(int) PyUnicode_AsWideChar(
				350	PyUnicodeObject *unicode, /* Unicode object */
				351	register wchar_t *w, /* wchar_t buffer */
				352	int size /* size of buffer */
				353	);
				354
				355	#endif
				356
				357	/* === Builtin Codecs =====================================================
				358
				359	Many of these APIs take two arguments encoding and errors. These
				360	parameters encoding and errors have the same semantics as the ones
				361	of the builtin unicode() API.
				362
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	363	Setting encoding to NULL causes the default encoding to be used.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	364
				365	Error handling is set by errors which may also be set to NULL
				366	meaning to use the default handling defined for the codec. Default
				367	error handling for all builtin codecs is "strict" (ValueErrors are
				368	raised).
				369
				370	The codecs all use a similar interface. Only deviation from the
				371	generic ones are documented.
				372
				373	*/
				374
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	375	/* --- Manage the default encoding ---------------------------------------- */
				376
				377	/* Returns the currently active default encoding.
				378
				379	The default encoding is currently implemented as run-time settable
				380	process global. This may change in future versions of the
				381	interpreter to become a parameter which is managed on a per-thread
				382	basis.
				383
				384	*/
				385
Thomas Wouters	5f37591	2000-07-22 23:30:03 +0000	[diff] [blame]	386	extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	387
				388	/* Sets the currently active default encoding.
				389
				390	Returns 0 on success, -1 in case of an error.
				391
				392	*/
				393
				394	extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
				395	const char *encoding /* Encoding name in standard form */
				396	);
				397
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	398	/* --- Generic Codecs ----------------------------------------------------- */
				399
				400	/* Create a Unicode object by decoding the encoded string s of the
				401	given size. */
				402
				403	extern DL_IMPORT(PyObject*) PyUnicode_Decode(
				404	const char *s, /* encoded string */
				405	int size, /* size of buffer */
				406	const char *encoding, /* encoding */
				407	const char *errors /* error handling */
				408	);
				409
				410	/* Encodes a Py_UNICODE buffer of the given size and returns a
				411	Python string object. */
				412
				413	extern DL_IMPORT(PyObject*) PyUnicode_Encode(
				414	const Py_UNICODE *s, /* Unicode char buffer */
				415	int size, /* number of Py_UNICODE chars to encode */
				416	const char *encoding, /* encoding */
				417	const char *errors /* error handling */
				418	);
				419
				420	/* Encodes a Unicode object and returns the result as Python string
				421	object. */
				422
				423	extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
				424	PyObject *unicode, /* Unicode object */
				425	const char *encoding, /* encoding */
				426	const char *errors /* error handling */
				427	);
				428
				429	/* --- UTF-8 Codecs ------------------------------------------------------- */
				430
				431	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
				432	const char *string, /* UTF-8 encoded string */
				433	int length, /* size of string */
				434	const char *errors /* error handling */
				435	);
				436
				437	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
				438	PyObject *unicode /* Unicode object */
				439	);
				440
				441	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
				442	const Py_UNICODE *data, /* Unicode char buffer */
				443	int length, /* number of Py_UNICODE chars to encode */
				444	const char *errors /* error handling */
				445	);
				446
				447	/* --- UTF-16 Codecs ------------------------------------------------------ */
				448
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	450	the corresponding Unicode object.
				451
				452	errors (if non-NULL) defines the error handling. It defaults
				453	to "strict".
				454
				455	If byteorder is non-NULL, the decoder starts decoding using the
				456	given byte order:
				457
				458	*byteorder == -1: little endian
				459	*byteorder == 0: native order
				460	*byteorder == 1: big endian
				461
				462	and then switches according to all BOM marks it finds in the input
				463	data. BOM marks are not copied into the resulting Unicode string.
				464	After completion, *byteorder is set to the current byte order at
				465	the end of input data.
				466
				467	If byteorder is NULL, the codec starts in native order mode.
				468
				469	*/
				470
				471	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
				472	const char *string, /* UTF-16 encoded string */
				473	int length, /* size of string */
				474	const char *errors, /* error handling */
				475	int *byteorder /* pointer to byteorder to use
				476	0=native;-1=LE,1=BE; updated on
				477	exit */
				478	);
				479
				480	/* Returns a Python string using the UTF-16 encoding in native byte
				481	order. The string always starts with a BOM mark. */
				482
				483	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
				484	PyObject *unicode /* Unicode object */
				485	);
				486
				487	/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	488	the Unicode data.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	489
				490	If byteorder is not 0, output is written according to the following
				491	byte order:
				492
				493	byteorder == -1: little endian
				494	byteorder == 0: native byte order (writes a BOM mark)
				495	byteorder == 1: big endian
				496
				497	If byteorder is 0, the output string will always start with the
				498	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
				499	prepended.
				500
				501	Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
				502	UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	503	at a later point without compromising the APIs.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	504
				505	*/
				506
				507	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
				508	const Py_UNICODE *data, /* Unicode char buffer */
				509	int length, /* number of Py_UNICODE chars to encode */
				510	const char *errors, /* error handling */
				511	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
				512	);
				513
				514	/* --- Unicode-Escape Codecs ---------------------------------------------- */
				515
				516	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
				517	const char *string, /* Unicode-Escape encoded string */
				518	int length, /* size of string */
				519	const char *errors /* error handling */
				520	);
				521
				522	extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
				523	PyObject *unicode /* Unicode object */
				524	);
				525
				526	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
				527	const Py_UNICODE *data, /* Unicode char buffer */
				528	int length /* Number of Py_UNICODE chars to encode */
				529	);
				530
				531	/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
				532
				533	extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
				534	const char *string, /* Raw-Unicode-Escape encoded string */
				535	int length, /* size of string */
				536	const char *errors /* error handling */
				537	);
				538
				539	extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
				540	PyObject *unicode /* Unicode object */
				541	);
				542
				543	extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
				544	const Py_UNICODE *data, /* Unicode char buffer */
				545	int length /* Number of Py_UNICODE chars to encode */
				546	);
				547
				548	/* --- Latin-1 Codecs -----------------------------------------------------
				549
				550	Note: Latin-1 corresponds to the first 256 Unicode ordinals.
				551
				552	*/
				553
				554	extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
				555	const char *string, /* Latin-1 encoded string */
				556	int length, /* size of string */
				557	const char *errors /* error handling */
				558	);
				559
				560	extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
				561	PyObject *unicode /* Unicode object */
				562	);
				563
				564	extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
				565	const Py_UNICODE *data, /* Unicode char buffer */
				566	int length, /* Number of Py_UNICODE chars to encode */
				567	const char *errors /* error handling */
				568	);
				569
				570	/* --- ASCII Codecs -------------------------------------------------------
				571
				572	Only 7-bit ASCII data is accepted. All other codes generate errors.
				573
				574	*/
				575
				576	extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
				577	const char *string, /* ASCII encoded string */
				578	int length, /* size of string */
				579	const char *errors /* error handling */
				580	);
				581
				582	extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
				583	PyObject *unicode /* Unicode object */
				584	);
				585
				586	extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
				587	const Py_UNICODE *data, /* Unicode char buffer */
				588	int length, /* Number of Py_UNICODE chars to encode */
				589	const char *errors /* error handling */
				590	);
				591
				592	/* --- Character Map Codecs -----------------------------------------------
				593
				594	This codec uses mappings to encode and decode characters.
				595
				596	Decoding mappings must map single string characters to single
				597	Unicode characters, integers (which are then interpreted as Unicode
				598	ordinals) or None (meaning "undefined mapping" and causing an
				599	error).
				600
				601	Encoding mappings must map single Unicode characters to single
				602	string characters, integers (which are then interpreted as Latin-1
				603	ordinals) or None (meaning "undefined mapping" and causing an
				604	error).
				605
				606	If a character lookup fails with a LookupError, the character is
				607	copied as-is meaning that its ordinal value will be interpreted as
				608	Unicode or Latin-1 ordinal resp. Because of this mappings only need
				609	to contain those mappings which map characters to different code
				610	points.
				611
				612	*/
				613
				614	extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
				615	const char *string, /* Encoded string */
				616	int length, /* size of string */
				617	PyObject *mapping, /* character mapping
				618	(char ordinal -> unicode ordinal) */
				619	const char *errors /* error handling */
				620	);
				621
				622	extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
				623	PyObject *unicode, /* Unicode object */
				624	PyObject *mapping /* character mapping
				625	(unicode ordinal -> char ordinal) */
				626	);
				627
				628	extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
				629	const Py_UNICODE *data, /* Unicode char buffer */
				630	int length, /* Number of Py_UNICODE chars to encode */
				631	PyObject *mapping, /* character mapping
				632	(unicode ordinal -> char ordinal) */
				633	const char *errors /* error handling */
				634	);
				635
				636	/* Translate a Py_UNICODE buffer of the given length by applying a
				637	character mapping table to it and return the resulting Unicode
				638	object.
				639
				640	The mapping table must map Unicode ordinal integers to Unicode
				641	ordinal integers or None (causing deletion of the character).
				642
				643	Mapping tables may be dictionaries or sequences. Unmapped character
				644	ordinals (ones which cause a LookupError) are left untouched and
				645	are copied as-is.
				646
				647	*/
				648
				649	extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
				650	const Py_UNICODE *data, /* Unicode char buffer */
				651	int length, /* Number of Py_UNICODE chars to encode */
				652	PyObject *table, /* Translate table */
				653	const char *errors /* error handling */
				654	);
				655
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	656	#ifdef MS_WIN32
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	657
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	658	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	659
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	660	extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
				661	const char *string, /* MBCS encoded string */
				662	int length, /* size of string */
				663	const char *errors /* error handling */
				664	);
				665
				666	extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
				667	PyObject *unicode /* Unicode object */
				668	);
				669
				670	extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
				671	const Py_UNICODE *data, /* Unicode char buffer */
				672	int length, /* Number of Py_UNICODE chars to encode */
				673	const char *errors /* error handling */
				674	);
				675
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	676	#endif /* MS_WIN32 */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	677
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	678	/* --- Decimal Encoder ---------------------------------------------------- */
				679
				680	/* Takes a Unicode string holding a decimal value and writes it into
				681	an output buffer using standard ASCII digit codes.
				682
				683	The output buffer has to provide at least length+1 bytes of storage
				684	area. The output string is 0-terminated.
				685
				686	The encoder converts whitespace to ' ', decimal characters to their
				687	corresponding ASCII digit and all other Latin-1 characters except
				688	\0 as-is. Characters outside this range (Unicode ordinals 1-256)
				689	are treated as errors. This includes embedded NULL bytes.
				690
				691	Error handling is defined by the errors argument:
				692
				693	NULL or "strict": raise a ValueError
				694	"ignore": ignore the wrong characters (these are not copied to the
				695	output buffer)
				696	"replace": replaces illegal characters with '?'
				697
				698	Returns 0 on success, -1 on failure.
				699
				700	*/
				701
				702	extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
				703	Py_UNICODE *s, /* Unicode buffer */
				704	int length, /* Number of Py_UNICODE chars to encode */
				705	char *output, /* Output buffer; must have size >= length+1 */
				706	const char *errors /* error handling */
				707	);
				708
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	709	/* --- Methods & Slots ----------------------------------------------------
				710
				711	These are capable of handling Unicode objects and strings on input
				712	(we refer to them as strings in the descriptions) and return
				713	Unicode objects or integers as appropriate. */
				714
				715	/* Concat two strings giving a new Unicode string. */
				716
				717	extern DL_IMPORT(PyObject*) PyUnicode_Concat(
				718	PyObject *left, /* Left string */
				719	PyObject *right /* Right string */
				720	);
				721
				722	/* Split a string giving a list of Unicode strings.
				723
				724	If sep is NULL, splitting will be done at all whitespace
				725	substrings. Otherwise, splits occur at the given separator.
				726
				727	At most maxsplit splits will be done. If negative, no limit is set.
				728
				729	Separators are not included in the resulting list.
				730
				731	*/
				732
				733	extern DL_IMPORT(PyObject*) PyUnicode_Split(
				734	PyObject *s, /* String to split */
				735	PyObject *sep, /* String separator */
				736	int maxsplit /* Maxsplit count */
				737	);
				738
				739	/* Ditto, but split at line breaks.
				740
				741	CRLF is considered to be one line break. Line breaks are not
				742	included in the resulting list. */
				743
				744	extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
				745	PyObject s, / String to split */
Guido van Rossum	004d64f	2000-04-11 15:39:46 +0000	[diff] [blame]	746	int keepends /* If true, line end markers are included */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	747	);
				748
				749	/* Translate a string by applying a character mapping table to it and
				750	return the resulting Unicode object.
				751
				752	The mapping table must map Unicode ordinal integers to Unicode
				753	ordinal integers or None (causing deletion of the character).
				754
				755	Mapping tables may be dictionaries or sequences. Unmapped character
				756	ordinals (ones which cause a LookupError) are left untouched and
				757	are copied as-is.
				758
				759	*/
				760
				761	extern DL_IMPORT(PyObject *) PyUnicode_Translate(
				762	PyObject *str, /* String to translate */
				763	PyObject *table, /* Mapping table (dictionary or sequence) */
				764	const char *errors /* Error handling */
				765	);
				766
				767	/* Join a sequence of strings using the given separator and return
				768	the resulting Unicode string. */
				769
				770	extern DL_IMPORT(PyObject*) PyUnicode_Join(
				771	PyObject *separator, /* Separator string */
				772	PyObject *seq /* Sequence of strings to join */
				773	);
				774
				775	/* Return 1 if substr matches str[start:end] at the given tail end, 0
				776	otherwise. */
				777
				778	extern DL_IMPORT(int) PyUnicode_Tailmatch(
				779	PyObject *str, /* String */
				780	PyObject *substr, /* Prefix or Suffix string */
				781	int start, /* Start index */
				782	int end, /* Stop index */
				783	int direction /* Tail end: -1 prefix, +1 suffix */
				784	);
				785
				786	/* Return the first position of substr in str[start:end] using the
				787	given search direction, or -1 if not found. */
				788
				789	extern DL_IMPORT(int) PyUnicode_Find(
				790	PyObject *str, /* String */
				791	PyObject *substr, /* Substring to find */
				792	int start, /* Start index */
				793	int end, /* Stop index */
				794	int direction /* Find direction: +1 forward, -1 backward */
				795	);
				796
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	797	/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	798
				799	extern DL_IMPORT(int) PyUnicode_Count(
				800	PyObject *str, /* String */
				801	PyObject *substr, /* Substring to count */
				802	int start, /* Start index */
				803	int end /* Stop index */
				804	);
				805
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	806	/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	807	and return the resulting Unicode object. */
				808
				809	extern DL_IMPORT(PyObject *) PyUnicode_Replace(
				810	PyObject *str, /* String */
				811	PyObject *substr, /* Substring to find */
				812	PyObject *replstr, /* Replacement string */
				813	int maxcount /* Max. number of replacements to apply;
				814	-1 = all */
				815	);
				816
				817	/* Compare two strings and return -1, 0, 1 for less than, equal,
				818	greater than, respectively. */
				819
				820	extern DL_IMPORT(int) PyUnicode_Compare(
				821	PyObject *left, /* Left string */
				822	PyObject *right /* Right string */
				823	);
				824
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	825	/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	826	the resulting Unicode string. */
				827
				828	extern DL_IMPORT(PyObject *) PyUnicode_Format(
				829	PyObject *format, /* Format string */
				830	PyObject *args /* Argument tuple or dictionary */
				831	);
				832
Guido van Rossum	d0d366b	2000-03-13 23:22:24 +0000	[diff] [blame]	833	/* Check whether element is contained in container and return 1/0
				834	accordingly.
				835
				836	element has to coerce to a one-element Unicode string. -1 is
				837	returned in case of an error. */
				838
				839	extern DL_IMPORT(int) PyUnicode_Contains(
				840	PyObject *container, /* Container string */
				841	PyObject *element /* Element string */
				842	);
				843
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	844	/* === Characters Type APIs =============================================== */
				845
				846	/* These should not be used directly. Use the Py_UNICODE_IS* and
				847	Py_UNICODE_TO* macros instead.
				848
				849	These APIs are implemented in Objects/unicodectype.c.
				850
				851	*/
				852
				853	extern DL_IMPORT(int) _PyUnicode_IsLowercase(
				854	Py_UNICODE ch /* Unicode character */
				855	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				856
				857	extern DL_IMPORT(int) _PyUnicode_IsUppercase(
				858	Py_UNICODE ch /* Unicode character */
				859	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				860
				861	extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
				862	Py_UNICODE ch /* Unicode character */
				863	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				864
				865	extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
				866	Py_UNICODE ch /* Unicode character */
				867	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				868
				869	extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
				870	Py_UNICODE ch /* Unicode character */
				871	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				872
				873	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
				874	Py_UNICODE ch /* Unicode character */
				875	); /* Not for direct use: call the Py_UNICODE_TO* macros instead. */
				876
				877	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
				878	Py_UNICODE ch /* Unicode character */
				879	); /* Not for direct use: call the Py_UNICODE_TO* macros instead. */
				880
				881	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
				882	Py_UNICODE ch /* Unicode character */
				883	); /* Not for direct use: call the Py_UNICODE_TO* macros instead. */
				884
				885	extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
				886	Py_UNICODE ch /* Unicode character */
				887	); /* Not for direct use: call the Py_UNICODE_TO* macros instead. */
				888
				889	extern DL_IMPORT(int) _PyUnicode_ToDigit(
				890	Py_UNICODE ch /* Unicode character */
				891	); /* Not for direct use: call the Py_UNICODE_TO* macros instead. */
				892
				893	extern DL_IMPORT(double) _PyUnicode_ToNumeric(
				894	Py_UNICODE ch /* Unicode character */
				895	); /* Not for direct use: call the Py_UNICODE_TO* macros instead. */
				896
				897	extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
				898	Py_UNICODE ch /* Unicode character */
				899	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				900
				901	extern DL_IMPORT(int) _PyUnicode_IsDigit(
				902	Py_UNICODE ch /* Unicode character */
				903	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				904
				905	extern DL_IMPORT(int) _PyUnicode_IsNumeric(
				906	Py_UNICODE ch /* Unicode character */
				907	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				908
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	909	extern DL_IMPORT(int) _PyUnicode_IsAlpha(
				910	Py_UNICODE ch /* Unicode character */
				911	); /* Not for direct use: call the Py_UNICODE_IS* macros instead. */
				912
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	913	#ifdef __cplusplus
				914	}
				915	#endif
				916	#endif /* !Py_UNICODEOBJECT_H */