Blame - Include/unicodeobject.h - platform/external/python/cpython3

blob: d89537fc91edbba59156c4939f2d2b30cdd19803 [file] [log] [blame]

Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	1	#ifndef Py_UNICODEOBJECT_H
				2	#define Py_UNICODEOBJECT_H
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	3
				4	/*
				5
				6	Unicode implementation based on original code by Fredrik Lundh,
				7	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				8	Unicode Integration Proposal (see file Misc/unicode.txt).
				9
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	10	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	11
				12
				13	Original header:
				14	--------------------------------------------------------------------
				15
				16	* Yet another Unicode string type for Python. This type supports the
				17	* 16-bit Basic Multilingual Plane (BMP) only.
				18	*
				19	* Written by Fredrik Lundh, January 1999.
				20	*
				21	* Copyright (c) 1999 by Secret Labs AB.
				22	* Copyright (c) 1999 by Fredrik Lundh.
				23	*
				24	* fredrik@pythonware.com
				25	* http://www.pythonware.com
				26	*
				27	* --------------------------------------------------------------------
				28	* This Unicode String Type is
				29	*
				30	* Copyright (c) 1999 by Secret Labs AB
				31	* Copyright (c) 1999 by Fredrik Lundh
				32	*
				33	* By obtaining, using, and/or copying this software and/or its
				34	* associated documentation, you agree that you have read, understood,
				35	* and will comply with the following terms and conditions:
				36	*
				37	* Permission to use, copy, modify, and distribute this software and its
				38	* associated documentation for any purpose and without fee is hereby
				39	* granted, provided that the above copyright notice appears in all
				40	* copies, and that both that copyright notice and this permission notice
				41	* appear in supporting documentation, and that the name of Secret Labs
				42	* AB or the author not be used in advertising or publicity pertaining to
				43	* distribution of the software without specific, written prior
				44	* permission.
				45	*
				46	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				47	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				48	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				49	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				50	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				51	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				52	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				53	* -------------------------------------------------------------------- */
				54
				55	#include "ctype.h"
				56
				57	/* === Internal API ======================================================= */
				58
				59	/* --- Internal Unicode Format -------------------------------------------- */
				60
Fredrik Lundh	9b14ab3	2001-06-26 22:59:49 +0000	[diff] [blame]	61	/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
				62	properly set, but the default rules below don't set it. I'll
				63	sort this out some other day -- fredrik@pythonware.com */
				64
				65	#ifndef Py_UNICODE_SIZE
				66	#error Must define Py_UNICODE_SIZE
				67	#endif
				68
Fredrik Lundh	1294ad0	2001-06-26 17:17:07 +0000	[diff] [blame]	69	/* experimental UCS-4 support. enable at your own risk! */
				70	#undef USE_UCS4_STORAGE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	71	#if Py_UNICODE_SIZE == 4
				72	#define USE_UCS4_STORAGE
				73	#endif
Fredrik Lundh	1294ad0	2001-06-26 17:17:07 +0000	[diff] [blame]	74
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	75	/* Set these flags if the platform has "wchar.h", "wctype.h" and the
				76	wchar_t type is a 16-bit unsigned type */
				77	/* #define HAVE_WCHAR_H */
				78	/* #define HAVE_USABLE_WCHAR_T */
				79
				80	/* Defaults for various platforms */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	81	#ifndef PY_UNICODE_TYPE
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	82
Fredrik Lundh	1294ad0	2001-06-26 17:17:07 +0000	[diff] [blame]	83	/* Windows has a usable wchar_t type (unless we're using UCS-4) */
				84	# if defined(MS_WIN32) && !defined(USE_UCS4_STORAGE)
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	85	# define HAVE_USABLE_WCHAR_T
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	86	# define PY_UNICODE_TYPE wchar_t
				87	# endif
				88
				89	# if defined(USE_UCS4_STORAGE)
				90	# define PY_UNICODE_TYPE Py_UCS4
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	91	# endif
				92
				93	#endif
				94
				95	/* If the compiler provides a wchar_t type we try to support it
				96	through the interface functions PyUnicode_FromWideChar() and
				97	PyUnicode_AsWideChar(). */
				98
				99	#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg	1a731c6	2000-08-11 11:43:10 +0000	[diff] [blame]	100	# ifndef HAVE_WCHAR_H
				101	# define HAVE_WCHAR_H
				102	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	103	#endif
				104
				105	#ifdef HAVE_WCHAR_H
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	106	/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
				107	# ifdef _HAVE_BSDI
				108	# include <time.h>
				109	# endif
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	110	# include "wchar.h"
				111	#endif
				112
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	113	/*
				114	* Use this typedef when you need to represent a UTF-16 surrogate pair
				115	* as single unsigned integer.
				116	*/
				117	#if SIZEOF_INT >= 4
				118	typedef unsigned int Py_UCS4;
				119	#elif SIZEOF_LONG >= 4
				120	typedef unsigned long Py_UCS4;
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	121	#endif
				122
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	123	#if SIZEOF_SHORT == 2
				124	typedef unsigned short Py_UCS2;
				125	#else
				126	#error Cannot find a two-byte type
				127	#endif
Marc-André Lemburg	4327910	2000-07-07 09:01:41 +0000	[diff] [blame]	128
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	129	typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg	4327910	2000-07-07 09:01:41 +0000	[diff] [blame]	130
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	131	/* --- Internal Unicode Operations ---------------------------------------- */
				132
				133	/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	134	of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
				135	configure Python using --with-ctype-functions. This reduces the
				136	interpreter's code size. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	137
				138	#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
				139
				140	#include "wctype.h"
				141
				142	#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
				143
				144	#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
				145	#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
				146	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				147	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				148
				149	#define Py_UNICODE_TOLOWER(ch) towlower(ch)
				150	#define Py_UNICODE_TOUPPER(ch) towupper(ch)
				151	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				152
				153	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				154	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				155	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				156
				157	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				158	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				159	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				160
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	161	#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
				162
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	163	#else
				164
				165	#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
				166
				167	#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
				168	#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
				169	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				170	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				171
				172	#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
				173	#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
				174	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				175
				176	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				177	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				178	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				179
				180	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				181	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				182	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				183
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	184	#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	185
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	186	#endif
Marc-André Lemburg	a9c103b	2000-07-03 10:52:13 +0000	[diff] [blame]	187
				188	#define Py_UNICODE_ISALNUM(ch) \
				189	(Py_UNICODE_ISALPHA(ch) || \
				190	Py_UNICODE_ISDECIMAL(ch) || \
				191	Py_UNICODE_ISDIGIT(ch) || \
				192	Py_UNICODE_ISNUMERIC(ch))
				193
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	194	#define Py_UNICODE_COPY(target, source, length)\
				195	(memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
				196
				197	#define Py_UNICODE_FILL(target, value, length) do\
				198	{int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\
				199	while (0)
				200
				201	#define Py_UNICODE_MATCH(string, offset, substring)\
Marc-André Lemburg	2f4d0e9	2000-06-18 22:22:27 +0000	[diff] [blame]	202	((*((string)->str + (offset)) == *((substring)->str)) &&\
				203	!memcmp((string)->str + (offset), (substring)->str,\
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	204	(substring)->length*sizeof(Py_UNICODE)))
				205
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	206	#ifdef __cplusplus
				207	extern "C" {
				208	#endif
				209
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	210	/* --- Unicode Type ------------------------------------------------------- */
				211
				212	typedef struct {
				213	PyObject_HEAD
				214	int length; /* Length of raw Unicode data in buffer */
				215	Py_UNICODE *str; /* Raw Unicode buffer */
				216	long hash; /* Hash value; -1 if not set */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	PyObject *defenc; /* (Default) Encoded version as Python
				218	string, or NULL; this is used for
				219	implementing the buffer protocol */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	220	} PyUnicodeObject;
				221
				222	extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
				223
				224	#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
				225
				226	/* Fast access macros */
				227	#define PyUnicode_GET_SIZE(op) \
				228	(((PyUnicodeObject *)(op))->length)
				229	#define PyUnicode_GET_DATA_SIZE(op) \
				230	(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
				231	#define PyUnicode_AS_UNICODE(op) \
				232	(((PyUnicodeObject *)(op))->str)
				233	#define PyUnicode_AS_DATA(op) \
				234	((const char *)((PyUnicodeObject *)(op))->str)
				235
				236	/* --- Constants ---------------------------------------------------------- */
				237
				238	/* This Unicode character will be used as replacement character during
				239	decoding if the errors argument is set to "replace". Note: the
				240	Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
				241	Unicode 3.0. */
				242
				243	#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
				244
				245	/* === Public API ========================================================= */
				246
				247	/* --- Plain Py_UNICODE --------------------------------------------------- */
				248
				249	/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	250	size.
				251
				252	u may be NULL which causes the contents to be undefined. It is the
				253	user's responsibility to fill in the needed data afterwards. Note
				254	that modifying the Unicode object contents after construction is
				255	only allowed if u was set to NULL.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	256
				257	The buffer is copied into the new object. */
				258
				259	extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
				260	const Py_UNICODE *u, /* Unicode buffer */
				261	int size /* size of buffer */
				262	);
				263
				264	/* Return a read-only pointer to the Unicode object's internal
				265	Py_UNICODE buffer. */
				266
				267	extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
				268	PyObject *unicode /* Unicode object */
				269	);
				270
				271	/* Get the length of the Unicode object. */
				272
				273	extern DL_IMPORT(int) PyUnicode_GetSize(
				274	PyObject *unicode /* Unicode object */
				275	);
				276
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame^]	277	/* Get the maximum ordinal for a Unicode character. */
				278	extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
				279
Guido van Rossum	52c2359	2000-04-10 13:41:41 +0000	[diff] [blame]	280	/* Resize an already allocated Unicode object to the new size length.
				281
				282	*unicode is modified to point to the new (resized) object and 0
				283	returned on success.
				284
				285	This API may only be called by the function which also called the
				286	Unicode constructor. The refcount on the object must be 1. Otherwise,
				287	an error is returned.
				288
				289	Error handling is implemented as follows: an exception is set, -1
				290	is returned and *unicode left untouched.
				291
				292	*/
				293
				294	extern DL_IMPORT(int) PyUnicode_Resize(
				295	PyObject **unicode, /* Pointer to the Unicode object */
				296	int length /* New length */
				297	);
				298
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	299	/* Coerce obj to an Unicode object and return a reference with
				300	incremented refcount.
				301
				302	Coercion is done in the following way:
				303
				304	1. Unicode objects are passed back as-is with incremented
				305	refcount.
				306
				307	2. String and other char buffer compatible objects are decoded
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	308	under the assumptions that they contain data using the current
				309	default encoding. Decoding is done in "strict" mode.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	310
				311	3. All other objects raise an exception.
				312
				313	The API returns NULL in case of an error. The caller is responsible
				314	for decref'ing the returned objects.
				315
				316	*/
				317
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	318	extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
				319	register PyObject *obj, /* Object */
				320	const char *encoding, /* encoding */
				321	const char *errors /* error handling */
				322	);
				323
				324	/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
				325	which results in using the default encoding as basis for
				326	decoding the object.
				327
				328	Coerces obj to an Unicode object and return a reference with
				329	incremented refcount.
				330
				331	The API returns NULL in case of an error. The caller is responsible
				332	for decref'ing the returned objects.
				333
				334	*/
				335
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	336	extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
				337	register PyObject *obj /* Object */
				338	);
				339
				340	/* --- wchar_t support for platforms which support it --------------------- */
				341
				342	#ifdef HAVE_WCHAR_H
				343
				344	/* Create a Unicode Object from the wchar_t buffer w of the given
				345	size.
				346
				347	The buffer is copied into the new object. */
				348
				349	extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
				350	register const wchar_t *w, /* wchar_t buffer */
				351	int size /* size of buffer */
				352	);
				353
				354	/* Copies the Unicode Object contents into the wchar_t buffer w. At
				355	most size wchar_t characters are copied.
				356
				357	Returns the number of wchar_t characters copied or -1 in case of an
				358	error. */
				359
				360	extern DL_IMPORT(int) PyUnicode_AsWideChar(
				361	PyUnicodeObject *unicode, /* Unicode object */
				362	register wchar_t *w, /* wchar_t buffer */
				363	int size /* size of buffer */
				364	);
				365
				366	#endif
				367
				368	/* === Builtin Codecs =====================================================
				369
				370	Many of these APIs take two arguments encoding and errors. These
				371	parameters encoding and errors have the same semantics as the ones
				372	of the builtin unicode() API.
				373
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	374	Setting encoding to NULL causes the default encoding to be used.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	375
				376	Error handling is set by errors which may also be set to NULL
				377	meaning to use the default handling defined for the codec. Default
				378	error handling for all builtin codecs is "strict" (ValueErrors are
				379	raised).
				380
				381	The codecs all use a similar interface. Only deviation from the
				382	generic ones are documented.
				383
				384	*/
				385
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	386	/* --- Manage the default encoding ---------------------------------------- */
				387
				388	/* Returns the currently active default encoding.
				389
				390	The default encoding is currently implemented as run-time settable
				391	process global. This may change in future versions of the
				392	interpreter to become a parameter which is managed on a per-thread
				393	basis.
				394
				395	*/
				396
Thomas Wouters	5f37591	2000-07-22 23:30:03 +0000	[diff] [blame]	397	extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drake	cb093fe	2000-05-09 19:51:53 +0000	[diff] [blame]	398
				399	/* Sets the currently active default encoding.
				400
				401	Returns 0 on success, -1 in case of an error.
				402
				403	*/
				404
				405	extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
				406	const char *encoding /* Encoding name in standard form */
				407	);
				408
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	409	/* --- Generic Codecs ----------------------------------------------------- */
				410
				411	/* Create a Unicode object by decoding the encoded string s of the
				412	given size. */
				413
				414	extern DL_IMPORT(PyObject*) PyUnicode_Decode(
				415	const char *s, /* encoded string */
				416	int size, /* size of buffer */
				417	const char *encoding, /* encoding */
				418	const char *errors /* error handling */
				419	);
				420
				421	/* Encodes a Py_UNICODE buffer of the given size and returns a
				422	Python string object. */
				423
				424	extern DL_IMPORT(PyObject*) PyUnicode_Encode(
				425	const Py_UNICODE *s, /* Unicode char buffer */
				426	int size, /* number of Py_UNICODE chars to encode */
				427	const char *encoding, /* encoding */
				428	const char *errors /* error handling */
				429	);
				430
				431	/* Encodes a Unicode object and returns the result as Python string
				432	object. */
				433
				434	extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
				435	PyObject *unicode, /* Unicode object */
				436	const char *encoding, /* encoding */
				437	const char *errors /* error handling */
				438	);
				439
				440	/* --- UTF-8 Codecs ------------------------------------------------------- */
				441
				442	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
				443	const char *string, /* UTF-8 encoded string */
				444	int length, /* size of string */
				445	const char *errors /* error handling */
				446	);
				447
				448	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
				449	PyObject *unicode /* Unicode object */
				450	);
				451
				452	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
				453	const Py_UNICODE *data, /* Unicode char buffer */
				454	int length, /* number of Py_UNICODE chars to encode */
				455	const char *errors /* error handling */
				456	);
				457
				458	/* --- UTF-16 Codecs ------------------------------------------------------ */
				459
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	460	/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	461	the corresponding Unicode object.
				462
				463	errors (if non-NULL) defines the error handling. It defaults
				464	to "strict".
				465
				466	If byteorder is non-NULL, the decoder starts decoding using the
				467	given byte order:
				468
				469	*byteorder == -1: little endian
				470	*byteorder == 0: native order
				471	*byteorder == 1: big endian
				472
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	473	In native mode, the first two bytes of the stream are checked for a
				474	BOM mark. If found, the BOM mark is analysed, the byte order
				475	adjusted and the BOM skipped. In the other modes, no BOM mark
				476	interpretation is done. After completion, *byteorder is set to the
				477	current byte order at the end of input data.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	478
				479	If byteorder is NULL, the codec starts in native order mode.
				480
				481	*/
				482
				483	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
				484	const char *string, /* UTF-16 encoded string */
				485	int length, /* size of string */
				486	const char *errors, /* error handling */
				487	int *byteorder /* pointer to byteorder to use
				488	0=native;-1=LE,1=BE; updated on
				489	exit */
				490	);
				491
				492	/* Returns a Python string using the UTF-16 encoding in native byte
				493	order. The string always starts with a BOM mark. */
				494
				495	extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
				496	PyObject *unicode /* Unicode object */
				497	);
				498
				499	/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	500	the Unicode data.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	501
				502	If byteorder is not 0, output is written according to the following
				503	byte order:
				504
				505	byteorder == -1: little endian
				506	byteorder == 0: native byte order (writes a BOM mark)
				507	byteorder == 1: big endian
				508
				509	If byteorder is 0, the output string will always start with the
				510	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
				511	prepended.
				512
				513	Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
				514	UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	515	at a later point without compromising the APIs.
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	516
				517	*/
				518
				519	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
				520	const Py_UNICODE *data, /* Unicode char buffer */
				521	int length, /* number of Py_UNICODE chars to encode */
				522	const char *errors, /* error handling */
				523	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
				524	);
				525
				526	/* --- Unicode-Escape Codecs ---------------------------------------------- */
				527
				528	extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
				529	const char *string, /* Unicode-Escape encoded string */
				530	int length, /* size of string */
				531	const char *errors /* error handling */
				532	);
				533
				534	extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
				535	PyObject *unicode /* Unicode object */
				536	);
				537
				538	extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
				539	const Py_UNICODE *data, /* Unicode char buffer */
				540	int length /* Number of Py_UNICODE chars to encode */
				541	);
				542
				543	/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
				544
				545	extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
				546	const char *string, /* Raw-Unicode-Escape encoded string */
				547	int length, /* size of string */
				548	const char *errors /* error handling */
				549	);
				550
				551	extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
				552	PyObject *unicode /* Unicode object */
				553	);
				554
				555	extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
				556	const Py_UNICODE *data, /* Unicode char buffer */
				557	int length /* Number of Py_UNICODE chars to encode */
				558	);
				559
				560	/* --- Latin-1 Codecs -----------------------------------------------------
				561
				562	Note: Latin-1 corresponds to the first 256 Unicode ordinals.
				563
				564	*/
				565
				566	extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
				567	const char *string, /* Latin-1 encoded string */
				568	int length, /* size of string */
				569	const char *errors /* error handling */
				570	);
				571
				572	extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
				573	PyObject *unicode /* Unicode object */
				574	);
				575
				576	extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
				577	const Py_UNICODE *data, /* Unicode char buffer */
				578	int length, /* Number of Py_UNICODE chars to encode */
				579	const char *errors /* error handling */
				580	);
				581
				582	/* --- ASCII Codecs -------------------------------------------------------
				583
				584	Only 7-bit ASCII data is accepted. All other codes generate errors.
				585
				586	*/
				587
				588	extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
				589	const char *string, /* ASCII encoded string */
				590	int length, /* size of string */
				591	const char *errors /* error handling */
				592	);
				593
				594	extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
				595	PyObject *unicode /* Unicode object */
				596	);
				597
				598	extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
				599	const Py_UNICODE *data, /* Unicode char buffer */
				600	int length, /* Number of Py_UNICODE chars to encode */
				601	const char *errors /* error handling */
				602	);
				603
				604	/* --- Character Map Codecs -----------------------------------------------
				605
				606	This codec uses mappings to encode and decode characters.
				607
				608	Decoding mappings must map single string characters to single
				609	Unicode characters, integers (which are then interpreted as Unicode
				610	ordinals) or None (meaning "undefined mapping" and causing an
				611	error).
				612
				613	Encoding mappings must map single Unicode characters to single
				614	string characters, integers (which are then interpreted as Latin-1
				615	ordinals) or None (meaning "undefined mapping" and causing an
				616	error).
				617
				618	If a character lookup fails with a LookupError, the character is
				619	copied as-is meaning that its ordinal value will be interpreted as
				620	Unicode or Latin-1 ordinal resp. Because of this mappings only need
				621	to contain those mappings which map characters to different code
				622	points.
				623
				624	*/
				625
				626	extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
				627	const char *string, /* Encoded string */
				628	int length, /* size of string */
				629	PyObject *mapping, /* character mapping
				630	(char ordinal -> unicode ordinal) */
				631	const char *errors /* error handling */
				632	);
				633
				634	extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
				635	PyObject *unicode, /* Unicode object */
				636	PyObject *mapping /* character mapping
				637	(unicode ordinal -> char ordinal) */
				638	);
				639
				640	extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
				641	const Py_UNICODE *data, /* Unicode char buffer */
				642	int length, /* Number of Py_UNICODE chars to encode */
				643	PyObject *mapping, /* character mapping
				644	(unicode ordinal -> char ordinal) */
				645	const char *errors /* error handling */
				646	);
				647
				648	/* Translate a Py_UNICODE buffer of the given length by applying a
				649	character mapping table to it and return the resulting Unicode
				650	object.
				651
				652	The mapping table must map Unicode ordinal integers to Unicode
				653	ordinal integers or None (causing deletion of the character).
				654
				655	Mapping tables may be dictionaries or sequences. Unmapped character
				656	ordinals (ones which cause a LookupError) are left untouched and
				657	are copied as-is.
				658
				659	*/
				660
				661	extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
				662	const Py_UNICODE *data, /* Unicode char buffer */
				663	int length, /* Number of Py_UNICODE chars to encode */
				664	PyObject *table, /* Translate table */
				665	const char *errors /* error handling */
				666	);
				667
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	668	#ifdef MS_WIN32
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	669
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	670	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	671
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	672	extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
				673	const char *string, /* MBCS encoded string */
				674	int length, /* size of string */
				675	const char *errors /* error handling */
				676	);
				677
				678	extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
				679	PyObject *unicode /* Unicode object */
				680	);
				681
				682	extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
				683	const Py_UNICODE *data, /* Unicode char buffer */
				684	int length, /* Number of Py_UNICODE chars to encode */
				685	const char *errors /* error handling */
				686	);
				687
Guido van Rossum	efec115	2000-03-28 02:01:15 +0000	[diff] [blame]	688	#endif /* MS_WIN32 */
Guido van Rossum	24bdb04	2000-03-28 20:29:59 +0000	[diff] [blame]	689
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	690	/* --- Decimal Encoder ---------------------------------------------------- */
				691
				692	/* Takes a Unicode string holding a decimal value and writes it into
				693	an output buffer using standard ASCII digit codes.
				694
				695	The output buffer has to provide at least length+1 bytes of storage
				696	area. The output string is 0-terminated.
				697
				698	The encoder converts whitespace to ' ', decimal characters to their
				699	corresponding ASCII digit and all other Latin-1 characters except
				700	\0 as-is. Characters outside this range (Unicode ordinals 1-256)
				701	are treated as errors. This includes embedded NULL bytes.
				702
				703	Error handling is defined by the errors argument:
				704
				705	NULL or "strict": raise a ValueError
				706	"ignore": ignore the wrong characters (these are not copied to the
				707	output buffer)
				708	"replace": replaces illegal characters with '?'
				709
				710	Returns 0 on success, -1 on failure.
				711
				712	*/
				713
				714	extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
				715	Py_UNICODE *s, /* Unicode buffer */
				716	int length, /* Number of Py_UNICODE chars to encode */
				717	char *output, /* Output buffer; must have size >= length + 1 */
				718	const char *errors /* error handling */
				719	);
				720
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	721	/* --- Methods & Slots ----------------------------------------------------
				722
				723	These are capable of handling Unicode objects and strings on input
				724	(we refer to them as strings in the descriptions) and return
				725	Unicode objects or integers as appropriate. */
				726
				727	/* Concat two strings giving a new Unicode string. */
				728
				729	extern DL_IMPORT(PyObject*) PyUnicode_Concat(
				730	PyObject *left, /* Left string */
				731	PyObject *right /* Right string */
				732	);
				733
				734	/* Split a string giving a list of Unicode strings.
				735
				736	If sep is NULL, splitting will be done at all whitespace
				737	substrings. Otherwise, splits occur at the given separator.
				738
				739	At most maxsplit splits will be done. If negative, no limit is set.
				740
				741	Separators are not included in the resulting list.
				742
				743	*/
				744
				745	extern DL_IMPORT(PyObject*) PyUnicode_Split(
				746	PyObject *s, /* String to split */
				747	PyObject *sep, /* String separator */
				748	int maxsplit /* Maxsplit count */
				749	);
				750
				751	/* Ditto, but split at line breaks.
				752
				753	CRLF is considered to be one line break. Line breaks are not
				754	included in the resulting list. */
				755
				756	extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
				757	PyObject *s, /* String to split */
Guido van Rossum	004d64f	2000-04-11 15:39:46 +0000	[diff] [blame]	758	int keepends /* If true, line end markers are included */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	759	);
				760
				761	/* Translate a string by applying a character mapping table to it and
				762	return the resulting Unicode object.
				763
				764	The mapping table must map Unicode ordinal integers to Unicode
				765	ordinal integers or None (causing deletion of the character).
				766
				767	Mapping tables may be dictionaries or sequences. Unmapped character
				768	ordinals (ones which cause a LookupError) are left untouched and
				769	are copied as-is.
				770
				771	*/
				772
				773	extern DL_IMPORT(PyObject *) PyUnicode_Translate(
				774	PyObject *str, /* String */
				775	PyObject *table, /* Translate table */
				776	const char *errors /* error handling */
				777	);
				778
				779	/* Join a sequence of strings using the given separator and return
				780	the resulting Unicode string. */
				781
				782	extern DL_IMPORT(PyObject*) PyUnicode_Join(
				783	PyObject *separator, /* Separator string */
				784	PyObject *seq /* Sequence object */
				785	);
				786
				787	/* Return 1 if substr matches str[start:end] at the given tail end, 0
				788	otherwise. */
				789
				790	extern DL_IMPORT(int) PyUnicode_Tailmatch(
				791	PyObject *str, /* String */
				792	PyObject *substr, /* Prefix or Suffix string */
				793	int start, /* Start index */
				794	int end, /* Stop index */
				795	int direction /* Tail end: -1 prefix, +1 suffix */
				796	);
				797
				798	/* Return the first position of substr in str[start:end] using the
				799	given search direction or -1 if not found. */
				800
				801	extern DL_IMPORT(int) PyUnicode_Find(
				802	PyObject *str, /* String */
				803	PyObject *substr, /* Substring to find */
				804	int start, /* Start index */
				805	int end, /* Stop index */
				806	int direction /* Find direction: +1 forward, -1 backward */
				807	);
				808
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	809	/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	810
				811	extern DL_IMPORT(int) PyUnicode_Count(
				812	PyObject *str, /* String */
				813	PyObject *substr, /* Substring to count */
				814	int start, /* Start index */
				815	int end /* Stop index */
				816	);
				817
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	818	/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	819	and return the resulting Unicode object. */
				820
				821	extern DL_IMPORT(PyObject *) PyUnicode_Replace(
				822	PyObject *str, /* String */
				823	PyObject *substr, /* Substring to find */
				824	PyObject *replstr, /* Substring to replace */
				825	int maxcount /* Max. number of replacements to apply;
				826	-1 = all */
				827	);
				828
				829	/* Compare two strings and return -1, 0, 1 for less than, equal,
				830	greater than resp. */
				831
				832	extern DL_IMPORT(int) PyUnicode_Compare(
				833	PyObject *left, /* Left string */
				834	PyObject *right /* Right string */
				835	);
				836
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	837	/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	838	the resulting Unicode string. */
				839
				840	extern DL_IMPORT(PyObject *) PyUnicode_Format(
				841	PyObject *format, /* Format string */
				842	PyObject *args /* Argument tuple or dictionary */
				843	);
				844
Guido van Rossum	d0d366b	2000-03-13 23:22:24 +0000	[diff] [blame]	845	/* Check whether element is contained in container and return 1/0
				846	accordingly.
				847
				848	element has to coerce to a one-element Unicode string. -1 is
				849	returned in case of an error. */
				850
				851	extern DL_IMPORT(int) PyUnicode_Contains(
				852	PyObject *container, /* Container string */
				853	PyObject *element /* Element string */
				854	);
				855
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	856	/* === Characters Type APIs =============================================== */
				857
				858	/* These should not be used directly. Use the Py_UNICODE_IS* and
				859	Py_UNICODE_TO* macros instead.
				860
				861	These APIs are implemented in Objects/unicodectype.c.
				862
				863	*/
				864
				865	extern DL_IMPORT(int) _PyUnicode_IsLowercase(
				866	register const Py_UNICODE ch /* Unicode character */
				867	);
				868
				869	extern DL_IMPORT(int) _PyUnicode_IsUppercase(
				870	register const Py_UNICODE ch /* Unicode character */
				871	);
				872
				873	extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
				874	register const Py_UNICODE ch /* Unicode character */
				875	);
				876
				877	extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
				878	register const Py_UNICODE ch /* Unicode character */
				879	);
				880
				881	extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
				882	register const Py_UNICODE ch /* Unicode character */
				883	);
				884
				885	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
				886	register const Py_UNICODE ch /* Unicode character */
				887	);
				888
				889	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
				890	register const Py_UNICODE ch /* Unicode character */
				891	);
				892
				893	extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
				894	register const Py_UNICODE ch /* Unicode character */
				895	);
				896
				897	extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
				898	register const Py_UNICODE ch /* Unicode character */
				899	);
				900
				901	extern DL_IMPORT(int) _PyUnicode_ToDigit(
				902	register const Py_UNICODE ch /* Unicode character */
				903	);
				904
				905	extern DL_IMPORT(double) _PyUnicode_ToNumeric(
				906	register const Py_UNICODE ch /* Unicode character */
				907	);
				908
				909	extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
				910	register const Py_UNICODE ch /* Unicode character */
				911	);
				912
				913	extern DL_IMPORT(int) _PyUnicode_IsDigit(
				914	register const Py_UNICODE ch /* Unicode character */
				915	);
				916
				917	extern DL_IMPORT(int) _PyUnicode_IsNumeric(
				918	register const Py_UNICODE ch /* Unicode character */
				919	);
				920
Marc-André Lemburg	f03e741	2000-07-05 09:45:59 +0000	[diff] [blame]	921	extern DL_IMPORT(int) _PyUnicode_IsAlpha(
				922	register const Py_UNICODE ch /* Unicode character */
				923	);
				924
Guido van Rossum	d822518	2000-03-10 22:33:05 +0000	[diff] [blame]	925	#ifdef __cplusplus
				926	}
				927	#endif
				928	#endif /* !Py_UNICODEOBJECT_H */