Blame - Include/unicodeobject.h - platform/external/python/cpython3

blob: 01dce9469c34c5c1d71c1c93f6ca5b58b2a02653 [file] [log] [blame]

Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	1	#ifndef Py_UNICODEOBJECT_H
				2	#define Py_UNICODEOBJECT_H
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	3
				4	/*
				5
				6	Unicode implementation based on original code by Fredrik Lundh,
				7	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				8	Unicode Integration Proposal (see file Misc/unicode.txt).
				9
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	10	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	11
				12
				13	Original header:
				14	--------------------------------------------------------------------
				15
				16	* Yet another Unicode string type for Python. This type supports the
				17	* 16-bit Basic Multilingual Plane (BMP) only.
				18	*
				19	* Written by Fredrik Lundh, January 1999.
				20	*
				21	* Copyright (c) 1999 by Secret Labs AB.
				22	* Copyright (c) 1999 by Fredrik Lundh.
				23	*
				24	* fredrik@pythonware.com
				25	* http://www.pythonware.com
				26	*
				27	* --------------------------------------------------------------------
				28	* This Unicode String Type is
				29	*
				30	* Copyright (c) 1999 by Secret Labs AB
				31	* Copyright (c) 1999 by Fredrik Lundh
				32	*
				33	* By obtaining, using, and/or copying this software and/or its
				34	* associated documentation, you agree that you have read, understood,
				35	* and will comply with the following terms and conditions:
				36	*
				37	* Permission to use, copy, modify, and distribute this software and its
				38	* associated documentation for any purpose and without fee is hereby
				39	* granted, provided that the above copyright notice appears in all
				40	* copies, and that both that copyright notice and this permission notice
				41	* appear in supporting documentation, and that the name of Secret Labs
				42	* AB or the author not be used in advertising or publicity pertaining to
				43	* distribution of the software without specific, written prior
				44	* permission.
				45	*
				46	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				47	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				48	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				49	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				50	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				51	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				52	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				53	* -------------------------------------------------------------------- */
				54
				55	#include "ctype.h"
				56
				57	/* === Internal API ======================================================= */
				58
				59	/* --- Internal Unicode Format -------------------------------------------- */
				60
				61	/* Set these flags if the platform has "wchar.h", "wctype.h" and the
				62	wchar_t type is a 16-bit unsigned type */
				63	/* #define HAVE_WCHAR_H */
				64	/* #define HAVE_USABLE_WCHAR_T */
				65
				66	/* Defaults for various platforms */
				67	#ifndef HAVE_USABLE_WCHAR_T
				68
				69	/* Windows has a usable wchar_t type */
				70	# if defined(MS_WIN32)
				71	# define HAVE_USABLE_WCHAR_T
				72	# endif
				73
				74	#endif
				75
				76	/* If the compiler provides a wchar_t type we try to support it
				77	through the interface functions PyUnicode_FromWideChar() and
				78	PyUnicode_AsWideChar(). */
				79
				80	#ifdef HAVE_USABLE_WCHAR_T
				81	# define HAVE_WCHAR_H
				82	#endif
				83
				84	#ifdef HAVE_WCHAR_H
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	85	/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
				86	# ifdef _HAVE_BSDI
				87	# include <time.h>
				88	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	89	# include "wchar.h"
				90	#endif
				91
				92	#ifdef HAVE_USABLE_WCHAR_T
				93
				94	If the compiler defines wchar_t as a 16-bit unsigned type we can
				95	use the compiler type directly. Works fine with all modern Windows
				96	platforms. */
				97
				98	typedef wchar_t Py_UNICODE;
				99
				100	#else
				101
				102	/* Use if you have a standard ANSI compiler, without wchar_t support.
				103	If a short is not 16 bits on your platform, you have to fix the
				104	typedef below, or the module initialization code will complain. */
				105
				106	typedef unsigned short Py_UNICODE;
				107
				108	#endif
				109
Marc-André Lemburg	4327910	2000-07-07 09:01:41 +0000	[diff] [blame]	110	/*
				111	* Use this typedef when you need to represent a UTF-16 surrogate pair
				112	* as single unsigned integer.
				113	*/
				114	#if SIZEOF_INT >= 4
				115	typedef unsigned int Py_UCS4;
				116	#elif SIZEOF_LONG >= 4
				117	typedef unsigned long Py_UCS4;
				118	#endif
				119
				120
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	121	/* --- Internal Unicode Operations ---------------------------------------- */
				122
				123	/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	124	of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
				125	configure Python using --with-ctype-functions. This reduces the
				126	interpreter's code size. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	127
				128	#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
				129
				130	#include "wctype.h"
				131
				132	#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
				133
				134	#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
				135	#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
				136	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				137	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				138
				139	#define Py_UNICODE_TOLOWER(ch) towlower(ch)
				140	#define Py_UNICODE_TOUPPER(ch) towupper(ch)
				141	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				142
				143	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				144	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				145	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				146
				147	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				148	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				149	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				150
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	151	#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
				152
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	153	#else
				154
				155	#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
				156
				157	#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
				158	#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
				159	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				160	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				161
				162	#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
				163	#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
				164	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				165
				166	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				167	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				168	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				169
				170	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				171	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				172	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				173
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	174	#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	175
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	176	#endif
Marc-André Lemburg	a9c103b	2000-07-03 10:52:13 +0000	[diff] [blame]	177
				178	#define Py_UNICODE_ISALNUM(ch) \
				179	(Py_UNICODE_ISALPHA(ch) || \
				180	Py_UNICODE_ISDECIMAL(ch) || \
				181	Py_UNICODE_ISDIGIT(ch) || \
				182	Py_UNICODE_ISNUMERIC(ch))
				183
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	184	#define Py_UNICODE_COPY(target, source, length)\
				185	(memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
				186
				187	#define Py_UNICODE_FILL(target, value, length) do\
				188	{int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\
				189	while (0)
				190
				191	#define Py_UNICODE_MATCH(string, offset, substring)\
Marc-André Lemburg	2f4d0e9	2000-06-18 22:22:27 +0000	[diff] [blame]	192	((*((string)->str + (offset)) == *((substring)->str)) &&\
				193	!memcmp((string)->str + (offset), (substring)->str,\
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	194	(substring)->length*sizeof(Py_UNICODE)))
				195
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	196	#ifdef __cplusplus
				197	extern "C" {
				198	#endif
				199
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	200	/* --- Unicode Type ------------------------------------------------------- */
				201
				202	typedef struct {
				203	PyObject_HEAD
				204	int length; /* Length of raw Unicode data in buffer */
				205	Py_UNICODE *str; /* Raw Unicode buffer */
				206	long hash; /* Hash value; -1 if not set */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	207	PyObject *defenc; /* (Default) Encoded version as Python
				208	string, or NULL; this is used for
				209	implementing the buffer protocol */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	210	} PyUnicodeObject;
				211
				212	extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
				213
				214	#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
				215
				216	/* Fast access macros */
				217	#define PyUnicode_GET_SIZE(op) \
				218	(((PyUnicodeObject *)(op))->length)
				219	#define PyUnicode_GET_DATA_SIZE(op) \
				220	(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
				221	#define PyUnicode_AS_UNICODE(op) \
				222	(((PyUnicodeObject *)(op))->str)
				223	#define PyUnicode_AS_DATA(op) \
				224	((const char *)((PyUnicodeObject *)(op))->str)
				225
				226	/* --- Constants ---------------------------------------------------------- */
				227
				228	/* This Unicode character will be used as replacement character during
				229	decoding if the errors argument is set to "replace". Note: the
				230	Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
				231	Unicode 3.0. */
				232
				233	#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
				234
				235	/* === Public API ========================================================= */
				236
				237	/* --- Plain Py_UNICODE --------------------------------------------------- */
				238
				239	/* Create a Unicode Object from the Py_UNICODE buffer u of the given
				240	size. u may be NULL which causes the contents to be undefined. It
				241	is the user's responsibility to fill in the needed data.
				242
				243	The buffer is copied into the new object. */
				244
				245	extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
				246	const Py_UNICODE *u, /* Unicode buffer */
				247	int size /* size of buffer */
				248	);
				249
				250	/* Return a read-only pointer to the Unicode object's internal
				251	Py_UNICODE buffer. */
				252
				253	extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
				254	PyObject *unicode /* Unicode object */
				255	);
				256
				257	/* Get the length of the Unicode object. */
				258
				259	extern DL_IMPORT(int) PyUnicode_GetSize(
				260	PyObject *unicode /* Unicode object */
				261	);
				262
Guido van Rossum	52c2359	2000-04-10 13:41:41 +0000	[diff] [blame]	263	/* Resize an already allocated Unicode object to the new size length.
				264
				265	*unicode is modified to point to the new (resized) object and 0
				266	returned on success.
				267
				268	This API may only be called by the function which also called the
				269	Unicode constructor. The refcount on the object must be 1. Otherwise,
				270	an error is returned.
				271
				272	Error handling is implemented as follows: an exception is set, -1
				273	is returned and *unicode left untouched.
				274
				275	*/
				276
				277	extern DL_IMPORT(int) PyUnicode_Resize(
				278	PyObject **unicode, /* Pointer to the Unicode object */
				279	int length /* New length */
				280	);
				281
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	282	/* Coerce obj to an Unicode object and return a reference with
				283	incremented refcount.
				284
				285	Coercion is done in the following way:
				286
				287	1. Unicode objects are passed back as-is with incremented
				288	refcount.
				289
				290	2. String and other char buffer compatible objects are decoded
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	291	under the assumptions that they contain data using the current
				292	default encoding. Decoding is done in "strict" mode.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	293
				294	3. All other objects raise an exception.
				295
				296	The API returns NULL in case of an error. The caller is responsible
				297	for decref'ing the returned objects.
				298
				299	*/
				300
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	301	extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
				302	register PyObject *obj, /* Object */
				303	const char *encoding, /* encoding */
				304	const char *errors /* error handling */
				305	);
				306
				307	/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
				308	which results in using the default encoding as basis for
				309	decoding the object.
				310
				311	Coerces obj to an Unicode object and return a reference with
				312	incremented refcount.
				313
				314	The API returns NULL in case of an error. The caller is responsible
				315	for decref'ing the returned objects.
				316
				317	*/
				318
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	319	extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
				320	register PyObject *obj /* Object */
				321	);
				322
				323	/* --- wchar_t support for platforms which support it --------------------- */
				324
				325	#ifdef HAVE_WCHAR_H
				326
				327	/* Create a Unicode Object from the wchar_t buffer w of the given
				328	size.
				329
				330	The buffer is copied into the new object. */
				331
				332	extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
				333	register const wchar_t *w, /* wchar_t buffer */
				334	int size /* size of buffer */
				335	);
				336
				337	/* Copies the Unicode Object contents into the wchar_t buffer w. At
				338	most size wchar_t characters are copied.
				339
				340	Returns the number of wchar_t characters copied or -1 in case of an
				341	error. */
				342
				343	extern DL_IMPORT(int) PyUnicode_AsWideChar(
				344	PyUnicodeObject *unicode, /* Unicode object */
				345	register wchar_t *w, /* wchar_t buffer */
				346	int size /* size of buffer */
				347	);
				348
				349	#endif
				350
				351	/* === Builtin Codecs =====================================================
				352
				353	Many of these APIs take two arguments encoding and errors. These
				354	parameters encoding and errors have the same semantics as the ones
				355	of the builtin unicode() API.
				356
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	357	Setting encoding to NULL causes the default encoding to be used.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	358
				359	Error handling is set by errors which may also be set to NULL
				360	meaning to use the default handling defined for the codec. Default
				361	error handling for all builtin codecs is "strict" (ValueErrors are
				362	raised).
				363
				364	The codecs all use a similar interface. Only deviation from the
				365	generic ones are documented.
				366
				367	*/
				368
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	369	/* --- Manage the default encoding ---------------------------------------- */
				370
				371	/* Returns the currently active default encoding.
				372
				373	The default encoding is currently implemented as run-time settable
				374	process global. This may change in future versions of the
				375	interpreter to become a parameter which is managed on a per-thread
				376	basis.
				377
				378	*/
				379
Thomas Wouters	5f37591	2000-07-22 23:30:03 +0000	[diff] [blame]	380	extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	381
				382	/* Sets the currently active default encoding.
				383
				384	Returns 0 on success, -1 in case of an error.
				385
				386	*/
				387
				388	extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
				389	const char *encoding /* Encoding name in standard form */
				390	);
				391
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	392	/* --- Generic Codecs ----------------------------------------------------- */
				393
				394	/* Create a Unicode object by decoding the encoded string s of the
				395	given size. */
				396
				397	extern DL_IMPORT(PyObject*) PyUnicode_Decode(
				398	const char *s, /* encoded string */
				399	int size, /* size of buffer */
				400	const char *encoding, /* encoding */
				401	const char *errors /* error handling */
				402	);
				403
				404	/* Encodes a Py_UNICODE buffer of the given size and returns a
				405	Python string object. */
				406
				407	extern DL_IMPORT(PyObject*) PyUnicode_Encode(
				408	const Py_UNICODE *s, /* Unicode char buffer */
				409	int size, /* number of Py_UNICODE chars to encode */
				410	const char *encoding, /* encoding */
				411	const char *errors /* error handling */
				412	);
				413
				414	/* Encodes a Unicode object and returns the result as Python string
				415	object. */
				416
				417	extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
				418	PyObject *unicode, /* Unicode object */
				419	const char *encoding, /* encoding */
				420	const char *errors /* error handling */
				421	);
				422
				423	/* --- UTF-8 Codecs ------------------------------------------------------- */
				424
				425	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
				426	const char *string, /* UTF-8 encoded string */
				427	int length, /* size of string */
				428	const char *errors /* error handling */
				429	);
				430
				431	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
				432	PyObject *unicode /* Unicode object */
				433	);
				434
				435	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
				436	const Py_UNICODE *data, /* Unicode char buffer */
				437	int length, /* number of Py_UNICODE chars to encode */
				438	const char *errors /* error handling */
				439	);
				440
				441	/* --- UTF-16 Codecs ------------------------------------------------------ */
				442
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	443	/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	444	the corresponding Unicode object.
				445
				446	errors (if non-NULL) defines the error handling. It defaults
				447	to "strict".
				448
				449	If byteorder is non-NULL, the decoder starts decoding using the
				450	given byte order:
				451
				452	*byteorder == -1: little endian
				453	*byteorder == 0: native order
				454	*byteorder == 1: big endian
				455
				456	and then switches according to all BOM marks it finds in the input
				457	data. BOM marks are not copied into the resulting Unicode string.
				458	After completion, *byteorder is set to the current byte order at
				459	the end of input data.
				460
				461	If byteorder is NULL, the codec starts in native order mode.
				462
				463	*/
				464
				465	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
				466	const char *string, /* UTF-16 encoded string */
				467	int length, /* size of string */
				468	const char *errors, /* error handling */
				469	int *byteorder /* pointer to byteorder to use
				470	0=native;-1=LE,1=BE; updated on
				471	exit */
				472	);
				473
				474	/* Returns a Python string using the UTF-16 encoding in native byte
				475	order. The string always starts with a BOM mark. */
				476
				477	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
				478	PyObject *unicode /* Unicode object */
				479	);
				480
				481	/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	482	the Unicode data.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	483
				484	If byteorder is not 0, output is written according to the following
				485	byte order:
				486
				487	byteorder == -1: little endian
				488	byteorder == 0: native byte order (writes a BOM mark)
				489	byteorder == 1: big endian
				490
				491	If byteorder is 0, the output string will always start with the
				492	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
				493	prepended.
				494
				495	Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
				496	UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	497	at a later point without compromising the APIs.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	498
				499	*/
				500
				501	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
				502	const Py_UNICODE *data, /* Unicode char buffer */
				503	int length, /* number of Py_UNICODE chars to encode */
				504	const char *errors, /* error handling */
				505	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
				506	);
				507
				508	/* --- Unicode-Escape Codecs ---------------------------------------------- */
				509
				510	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
				511	const char *string, /* Unicode-Escape encoded string */
				512	int length, /* size of string */
				513	const char *errors /* error handling */
				514	);
				515
				516	extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
				517	PyObject *unicode /* Unicode object */
				518	);
				519
				520	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
				521	const Py_UNICODE *data, /* Unicode char buffer */
				522	int length /* Number of Py_UNICODE chars to encode */
				523	);
				524
				525	/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
				526
				527	extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
				528	const char *string, /* Raw-Unicode-Escape encoded string */
				529	int length, /* size of string */
				530	const char *errors /* error handling */
				531	);
				532
				533	extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
				534	PyObject *unicode /* Unicode object */
				535	);
				536
				537	extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
				538	const Py_UNICODE *data, /* Unicode char buffer */
				539	int length /* Number of Py_UNICODE chars to encode */
				540	);
				541
				542	/* --- Latin-1 Codecs -----------------------------------------------------
				543
				544	Note: Latin-1 corresponds to the first 256 Unicode ordinals.
				545
				546	*/
				547
				548	extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
				549	const char *string, /* Latin-1 encoded string */
				550	int length, /* size of string */
				551	const char *errors /* error handling */
				552	);
				553
				554	extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
				555	PyObject *unicode /* Unicode object */
				556	);
				557
				558	extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
				559	const Py_UNICODE *data, /* Unicode char buffer */
				560	int length, /* Number of Py_UNICODE chars to encode */
				561	const char *errors /* error handling */
				562	);
				563
				564	/* --- ASCII Codecs -------------------------------------------------------
				565
				566	Only 7-bit ASCII data is accepted. All other codes generate errors.
				567
				568	*/
				569
				570	extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
				571	const char *string, /* ASCII encoded string */
				572	int length, /* size of string */
				573	const char *errors /* error handling */
				574	);
				575
				576	extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
				577	PyObject *unicode /* Unicode object */
				578	);
				579
				580	extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
				581	const Py_UNICODE *data, /* Unicode char buffer */
				582	int length, /* Number of Py_UNICODE chars to encode */
				583	const char *errors /* error handling */
				584	);
				585
				586	/* --- Character Map Codecs -----------------------------------------------
				587
				588	This codec uses mappings to encode and decode characters.
				589
				590	Decoding mappings must map single string characters to single
				591	Unicode characters, integers (which are then interpreted as Unicode
				592	ordinals) or None (meaning "undefined mapping" and causing an
				593	error).
				594
				595	Encoding mappings must map single Unicode characters to single
				596	string characters, integers (which are then interpreted as Latin-1
				597	ordinals) or None (meaning "undefined mapping" and causing an
				598	error).
				599
				600	If a character lookup fails with a LookupError, the character is
				601	copied as-is meaning that its ordinal value will be interpreted as
				602	Unicode or Latin-1 ordinal resp. Because of this mappings only need
				603	to contain those mappings which map characters to different code
				604	points.
				605
				606	*/
				607
				608	extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
				609	const char *string, /* Encoded string */
				610	int length, /* size of string */
				611	PyObject *mapping, /* character mapping
				612	(char ordinal -> unicode ordinal) */
				613	const char *errors /* error handling */
				614	);
				615
				616	extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
				617	PyObject *unicode, /* Unicode object */
				618	PyObject *mapping /* character mapping
				619	(unicode ordinal -> char ordinal) */
				620	);
				621
				622	extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
				623	const Py_UNICODE *data, /* Unicode char buffer */
				624	int length, /* Number of Py_UNICODE chars to encode */
				625	PyObject *mapping, /* character mapping
				626	(unicode ordinal -> char ordinal) */
				627	const char *errors /* error handling */
				628	);
				629
				630	/* Translate a Py_UNICODE buffer of the given length by applying a
				631	character mapping table to it and return the resulting Unicode
				632	object.
				633
				634	The mapping table must map Unicode ordinal integers to Unicode
				635	ordinal integers or None (causing deletion of the character).
				636
				637	Mapping tables may be dictionaries or sequences. Unmapped character
				638	ordinals (ones which cause a LookupError) are left untouched and
				639	are copied as-is.
				640
				641	*/
				642
				643	extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
				644	const Py_UNICODE *data, /* Unicode char buffer */
				645	int length, /* Number of Py_UNICODE chars to encode */
				646	PyObject *table, /* Translate table */
				647	const char *errors /* error handling */
				648	);
				649
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	650	#ifdef MS_WIN32
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	651
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	652	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	653
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	654	extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
				655	const char *string, /* MBCS encoded string */
				656	int length, /* size of string */
				657	const char *errors /* error handling */
				658	);
				659
				660	extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
				661	PyObject *unicode /* Unicode object */
				662	);
				663
				664	extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
				665	const Py_UNICODE *data, /* Unicode char buffer */
				666	int length, /* Number of Py_UNICODE chars to encode */
				667	const char *errors /* error handling */
				668	);
				669
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	670	#endif /* MS_WIN32 */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	671
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	672	/* --- Decimal Encoder ---------------------------------------------------- */
				673
				674	/* Takes a Unicode string holding a decimal value and writes it into
				675	an output buffer using standard ASCII digit codes.
				676
				677	The output buffer has to provide at least length+1 bytes of storage
				678	area. The output string is 0-terminated.
				679
				680	The encoder converts whitespace to ' ', decimal characters to their
				681	corresponding ASCII digit and all other Latin-1 characters except
				682	\0 as-is. Characters outside this range (Unicode ordinals 1-256)
				683	are treated as errors. This includes embedded NULL bytes.
				684
				685	Error handling is defined by the errors argument:
				686
				687	NULL or "strict": raise a ValueError
				688	"ignore": ignore the wrong characters (these are not copied to the
				689	output buffer)
				690	"replace": replaces illegal characters with '?'
				691
				692	Returns 0 on success, -1 on failure.
				693
				694	*/
				695
				696	extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
				697	Py_UNICODE *s, /* Unicode buffer */
				698	int length, /* Number of Py_UNICODE chars to encode */
				699	char *output, /* Output buffer; must have size >= length+1 */
				700	const char *errors /* error handling */
				701	);
				702
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	703	/* --- Methods & Slots ----------------------------------------------------
				704
				705	These are capable of handling Unicode objects and strings on input
				706	(we refer to them as strings in the descriptions) and return
				707	Unicode objects or integers as appropriate. */
				708
				709	/* Concat two strings giving a new Unicode string. */
				710
				711	extern DL_IMPORT(PyObject*) PyUnicode_Concat(
				712	PyObject *left, /* Left string */
				713	PyObject *right /* Right string */
				714	);
				715
				716	/* Split a string giving a list of Unicode strings.
				717
				718	If sep is NULL, splitting will be done at all whitespace
				719	substrings. Otherwise, splits occur at the given separator.
				720
				721	At most maxsplit splits will be done. If negative, no limit is set.
				722
				723	Separators are not included in the resulting list.
				724
				725	*/
				726
				727	extern DL_IMPORT(PyObject*) PyUnicode_Split(
				728	PyObject *s, /* String to split */
				729	PyObject *sep, /* String separator */
				730	int maxsplit /* Maxsplit count */
				731	);
				732
				733	/* Ditto, but split at line breaks.
				734
				735	CRLF is considered to be one line break. Line breaks are not
				736	included in the resulting list. */
				737
				738	extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
				739	PyObject *s, /* String to split */
Guido van Rossum	004d64f	2000-04-11 15:39:46 +0000	[diff] [blame]	740	int keepends /* If true, line end markers are included */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	741	);
				742
				743	/* Translate a string by applying a character mapping table to it and
				744	return the resulting Unicode object.
				745
				746	The mapping table must map Unicode ordinal integers to Unicode
				747	ordinal integers or None (causing deletion of the character).
				748
				749	Mapping tables may be dictionaries or sequences. Unmapped character
				750	ordinals (ones which cause a LookupError) are left untouched and
				751	are copied as-is.
				752
				753	*/
				754
				755	extern DL_IMPORT(PyObject *) PyUnicode_Translate(
				756	    PyObject *str, /* String */
				757	    PyObject *table, /* Translate table (dictionary or sequence) */
				758	    const char *errors /* error handling */
				759	);
				760
				761	/* Join a sequence of strings using the given separator and return
				762	the resulting Unicode string. */
				763
				764	extern DL_IMPORT(PyObject*) PyUnicode_Join(
				765	    PyObject *separator, /* Separator string */
				766	    PyObject *seq /* Sequence object */
				767	);
				768
				769	/* Return 1 if substr matches str[start:end] at the given tail end, 0
				770	otherwise. */
				771
				772	extern DL_IMPORT(int) PyUnicode_Tailmatch(
				773	    PyObject *str, /* String */
				774	    PyObject *substr, /* Prefix or Suffix string */
				775	    int start, /* Start index */
				776	    int end, /* Stop index */
				777	    int direction /* Tail end: -1 prefix, +1 suffix */
				778	);
				779
				780	/* Return the first position of substr in str[start:end] using the
				781	given search direction or -1 if not found. */
				782
				783	extern DL_IMPORT(int) PyUnicode_Find(
				784	    PyObject *str, /* String */
				785	    PyObject *substr, /* Substring to find */
				786	    int start, /* Start index */
				787	    int end, /* Stop index */
				788	    int direction /* Find direction: +1 forward, -1 backward */
				789	);
				790
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	791	/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	792
				793	extern DL_IMPORT(int) PyUnicode_Count(
				794	    PyObject *str, /* String */
				795	    PyObject *substr, /* Substring to count */
				796	    int start, /* Start index */
				797	    int end /* Stop index */
				798	);
				799
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	800	/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	801	and return the resulting Unicode object. */
				802
				803	extern DL_IMPORT(PyObject *) PyUnicode_Replace(
				804	    PyObject *str, /* String */
				805	    PyObject *substr, /* Substring to find */
				806	    PyObject *replstr, /* Substring to replace */
				807	    int maxcount /* Max. number of replacements to apply;
				808	                    -1 = all */
				809	);
				810
				811	/* Compare two strings and return -1, 0, 1 for less than, equal,
				812	greater than, respectively. */
				813
				814	extern DL_IMPORT(int) PyUnicode_Compare(
				815	    PyObject *left, /* Left string */
				816	    PyObject *right /* Right string */
				817	);
				818
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	819	/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	820	the resulting Unicode string. */
				821
				822	extern DL_IMPORT(PyObject *) PyUnicode_Format(
				823	    PyObject *format, /* Format string */
				824	    PyObject *args /* Argument tuple or dictionary */
				825	);
				826
Guido van Rossum	d0d366b	2000-03-13 23:22:24 +0000	[diff] [blame]	827	/* Check whether element is contained in container and return 1/0
				828	accordingly.
				829
				830	element has to coerce to a one-element Unicode string. -1 is
				831	returned in case of an error. */
				832
				833	extern DL_IMPORT(int) PyUnicode_Contains(
				834	    PyObject *container, /* Container string */
				835	    PyObject *element /* Element string */
				836	);
				837
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	838	/* === Characters Type APIs =============================================== */
				839
				840	/* These should not be used directly. Use the Py_UNICODE_IS* and
				841	Py_UNICODE_TO* macros instead.
				842
				843	These APIs are implemented in Objects/unicodectype.c.
				844
				845	*/
				846
				847	extern DL_IMPORT(int) _PyUnicode_IsLowercase(
				848	    Py_UNICODE ch /* Unicode character */
				849	);
				850
				851	extern DL_IMPORT(int) _PyUnicode_IsUppercase(
				852	    Py_UNICODE ch /* Unicode character */
				853	);
				854
				855	extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
				856	    Py_UNICODE ch /* Unicode character */
				857	);
				858
				859	extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
				860	    Py_UNICODE ch /* Unicode character */
				861	);
				862
				863	extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
				864	    Py_UNICODE ch /* Unicode character */
				865	);
				866
				867	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
				868	    Py_UNICODE ch /* Unicode character */
				869	);
				870
				871	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
				872	    Py_UNICODE ch /* Unicode character */
				873	);
				874
				875	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
				876	    Py_UNICODE ch /* Unicode character */
				877	);
				878
				879	extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
				880	    Py_UNICODE ch /* Unicode character */
				881	);
				882
				883	extern DL_IMPORT(int) _PyUnicode_ToDigit(
				884	    Py_UNICODE ch /* Unicode character */
				885	);
				886
				887	extern DL_IMPORT(double) _PyUnicode_ToNumeric(
				888	    Py_UNICODE ch /* Unicode character */
				889	);
				890
				891	extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
				892	    Py_UNICODE ch /* Unicode character */
				893	);
				894
				895	extern DL_IMPORT(int) _PyUnicode_IsDigit(
				896	    Py_UNICODE ch /* Unicode character */
				897	);
				898
				899	extern DL_IMPORT(int) _PyUnicode_IsNumeric(
				900	    Py_UNICODE ch /* Unicode character */
				901	);
				902
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	903	extern DL_IMPORT(int) _PyUnicode_IsAlpha(
				904	    Py_UNICODE ch /* Unicode character */
				905	);
				906
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	907	#ifdef __cplusplus
				908	}
				909	#endif
				910	#endif /* !Py_UNICODEOBJECT_H */