Blame - Include/cpython/unicodeobject.h - platform/external/python/cpython3

blob: 88a97a4cb5f71f1e47a2dfbd52d6bafd7d0b0497 [file] [log] [blame]

Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1	#ifndef Py_CPYTHON_UNICODEOBJECT_H
				2	# error "this header file must not be included directly"
				3	#endif
				4
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	5	/* Py_UNICODE was the native Unicode storage format (code unit) used by
				6	Python and represents a single Unicode element in the Unicode type.
				7	With PEP 393, Py_UNICODE is deprecated and replaced with a
				8	typedef to wchar_t. */
				9	#define PY_UNICODE_TYPE wchar_t
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	10	/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	11
				12	/* --- Internal Unicode Operations ---------------------------------------- */
				13
				14	/* Since splitting on whitespace is an important use case, and
				15	whitespace in most situations is solely ASCII whitespace, we
				16	optimize for the common case by using a quick look-up table
				17	_Py_ascii_whitespace (see below) with an inlined check.
				18
				19	*/
				20	#define Py_UNICODE_ISSPACE(ch) \
				21	((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
				22
				23	#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
				24	#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
				25	#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
				26	#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
				27
				28	#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
				29	#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
				30	#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
				31
				32	#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
				33	#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
				34	#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
				35	#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
				36
				37	#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
				38	#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
				39	#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
				40
				41	#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
				42
				43	#define Py_UNICODE_ISALNUM(ch) \
				44	(Py_UNICODE_ISALPHA(ch) || \
				45	Py_UNICODE_ISDECIMAL(ch) || \
				46	Py_UNICODE_ISDIGIT(ch) || \
				47	Py_UNICODE_ISNUMERIC(ch))
				48
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	49	Py_DEPRECATED(3.3) static inline void
				50	Py_UNICODE_COPY(Py_UNICODE *target, const Py_UNICODE *source, Py_ssize_t length) {
				51	memcpy(target, source, length * sizeof(Py_UNICODE));
				52	}
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	53
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	54	Py_DEPRECATED(3.3) static inline void
				55	Py_UNICODE_FILL(Py_UNICODE *target, Py_UNICODE value, Py_ssize_t length) {
Inada Naoki	8e34e92	2020-06-17 23:43:01 +0900	[diff] [blame]	56	Py_ssize_t i;
				57	for (i = 0; i < length; i++) {
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	58	target[i] = value;
				59	}
				60	}
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	61
				62	/* macros to work with surrogates */
				63	#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
				64	#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
				65	#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
				66	/* Join two surrogate characters and return a single Py_UCS4 value. */
				67	#define Py_UNICODE_JOIN_SURROGATES(high, low) \
				68	(((((Py_UCS4)(high) & 0x03FF) << 10) | \
				69	((Py_UCS4)(low) & 0x03FF)) + 0x10000)
				70	/* high surrogate = top 10 bits added to D800 */
				71	#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
				72	/* low surrogate = bottom 10 bits added to DC00 */
				73	#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
				74
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	75	/* --- Unicode Type ------------------------------------------------------- */
				76
				77	/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
				78	structure. state.ascii and state.compact are set, and the data
				79	immediately follow the structure. utf8_length and wstr_length can be found
				80	in the length field; the utf8 pointer is equal to the data pointer. */
				81	typedef struct {
				82	/* There are 4 forms of Unicode strings:
				83
				84	- compact ascii:
				85
				86	* structure = PyASCIIObject
				87	* test: PyUnicode_IS_COMPACT_ASCII(op)
				88	* kind = PyUnicode_1BYTE_KIND
				89	* compact = 1
				90	* ascii = 1
				91	* ready = 1
				92	* (length is the length of the utf8 and wstr strings)
				93	* (data starts just after the structure)
				94	* (since ASCII is decoded from UTF-8, the utf8 string is the data)
				95
				96	- compact:
				97
				98	* structure = PyCompactUnicodeObject
				99	* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
				100	* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
				101	PyUnicode_4BYTE_KIND
				102	* compact = 1
				103	* ready = 1
				104	* ascii = 0
				105	* utf8 is not shared with data
				106	* utf8_length = 0 if utf8 is NULL
				107	* wstr is shared with data and wstr_length=length
				108	if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
				109	or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
				110	* wstr_length = 0 if wstr is NULL
				111	* (data starts just after the structure)
				112
				113	- legacy string, not ready:
				114
				115	* structure = PyUnicodeObject
				116	* test: kind == PyUnicode_WCHAR_KIND
				117	* length = 0 (use wstr_length)
				118	* hash = -1
				119	* kind = PyUnicode_WCHAR_KIND
				120	* compact = 0
				121	* ascii = 0
				122	* ready = 0
				123	* interned = SSTATE_NOT_INTERNED
				124	* wstr is not NULL
				125	* data.any is NULL
				126	* utf8 is NULL
				127	* utf8_length = 0
				128
				129	- legacy string, ready:
				130
				131	* structure = PyUnicodeObject structure
				132	* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
				133	* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
				134	PyUnicode_4BYTE_KIND
				135	* compact = 0
				136	* ready = 1
				137	* data.any is not NULL
				138	* utf8 is shared and utf8_length = length with data.any if ascii = 1
				139	* utf8_length = 0 if utf8 is NULL
				140	* wstr is shared with data.any and wstr_length = length
				141	if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
				142	or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
				143	* wstr_length = 0 if wstr is NULL
				144
				145	Compact strings use only one memory block (structure + characters),
				146	whereas legacy strings use one block for the structure and one block
				147	for characters.
				148
				149	Legacy strings are created by PyUnicode_FromUnicode() and
				150	PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
				151	when PyUnicode_READY() is called.
				152
				153	See also _PyUnicode_CheckConsistency().
				154	*/
				155	PyObject_HEAD
				156	Py_ssize_t length; /* Number of code points in the string */
				157	Py_hash_t hash; /* Hash value; -1 if not set */
				158	struct {
				159	/*
				160	SSTATE_NOT_INTERNED (0)
				161	SSTATE_INTERNED_MORTAL (1)
				162	SSTATE_INTERNED_IMMORTAL (2)
				163
				164	If interned != SSTATE_NOT_INTERNED, the two references from the
				165	dictionary to this object are not counted in ob_refcnt.
				166	*/
				167	unsigned int interned:2;
				168	/* Character size:
				169
				170	- PyUnicode_WCHAR_KIND (0):
				171
				172	* character type = wchar_t (16 or 32 bits, depending on the
				173	platform)
				174
				175	- PyUnicode_1BYTE_KIND (1):
				176
				177	* character type = Py_UCS1 (8 bits, unsigned)
				178	* all characters are in the range U+0000-U+00FF (latin1)
				179	* if ascii is set, all characters are in the range U+0000-U+007F
				180	(ASCII), otherwise at least one character is in the range
				181	U+0080-U+00FF
				182
				183	- PyUnicode_2BYTE_KIND (2):
				184
				185	* character type = Py_UCS2 (16 bits, unsigned)
				186	* all characters are in the range U+0000-U+FFFF (BMP)
				187	* at least one character is in the range U+0100-U+FFFF
				188
				189	- PyUnicode_4BYTE_KIND (4):
				190
				191	* character type = Py_UCS4 (32 bits, unsigned)
				192	* all characters are in the range U+0000-U+10FFFF
				193	* at least one character is in the range U+10000-U+10FFFF
				194	*/
				195	unsigned int kind:3;
				196	/* Compact is with respect to the allocation scheme. Compact unicode
				197	objects only require one memory block while non-compact objects use
				198	one block for the PyUnicodeObject struct and another for its data
				199	buffer. */
				200	unsigned int compact:1;
				201	/* The string only contains characters in the range U+0000-U+007F (ASCII)
				202	and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
				203	set, use the PyASCIIObject structure. */
				204	unsigned int ascii:1;
				205	/* The ready flag indicates whether the object layout is initialized
				206	completely. This means that this is either a compact object, or
				207	the data pointer is filled out. The bit is redundant, and helps
				208	to minimize the test in PyUnicode_IS_READY(). */
				209	unsigned int ready:1;
				210	/* Padding to ensure that PyUnicode_DATA() is always aligned to
				211	4 bytes (see issue #19537 on m68k). */
				212	unsigned int :24;
				213	} state;
				214	wchar_t *wstr; /* wchar_t representation (null-terminated) */
				215	} PyASCIIObject;
				216
				217	/* Non-ASCII strings allocated through PyUnicode_New use the
				218	PyCompactUnicodeObject structure. state.compact is set, and the data
				219	immediately follow the structure. */
				220	typedef struct {
				221	PyASCIIObject _base;
				222	Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
				223	* terminating \0. */
				224	char *utf8; /* UTF-8 representation (null-terminated) */
				225	Py_ssize_t wstr_length; /* Number of code points in wstr, possible
				226	* surrogates count as two code points. */
				227	} PyCompactUnicodeObject;
				228
				229	/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
				230	PyUnicodeObject structure. The actual string data is initially in the wstr
				231	block, and copied into the data block using _PyUnicode_Ready. */
				232	typedef struct {
				233	PyCompactUnicodeObject _base;
				234	union {
				235	void *any;
				236	Py_UCS1 *latin1;
				237	Py_UCS2 *ucs2;
				238	Py_UCS4 *ucs4;
				239	} data; /* Canonical, smallest-form Unicode buffer */
				240	} PyUnicodeObject;
				241
Victor Stinner	6876257	2019-10-07 18:42:01 +0200	[diff] [blame]	242	PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
				243	PyObject *op,
				244	int check_content);
				245
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	246	/* Fast access macros */
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	247
				248	/* Returns the deprecated Py_UNICODE representation's size in code units
				249	(this includes surrogate pairs as 2 units).
				250	If the Py_UNICODE representation is not available, it will be computed
				251	on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
				252
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	253	/* Py_DEPRECATED(3.3) */
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	254	#define PyUnicode_GET_SIZE(op) \
				255	(assert(PyUnicode_Check(op)), \
				256	(((PyASCIIObject *)(op))->wstr) ? \
				257	PyUnicode_WSTR_LENGTH(op) : \
				258	((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\
				259	assert(((PyASCIIObject *)(op))->wstr), \
				260	PyUnicode_WSTR_LENGTH(op)))
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	261
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	262	/* Py_DEPRECATED(3.3) */
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	263	#define PyUnicode_GET_DATA_SIZE(op) \
				264	(PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	265
				266	/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
				267	representation on demand. Using this macro is very inefficient now,
				268	try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
				269	use PyUnicode_WRITE() and PyUnicode_READ(). */
				270
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	271	/* Py_DEPRECATED(3.3) */
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	272	#define PyUnicode_AS_UNICODE(op) \
				273	(assert(PyUnicode_Check(op)), \
				274	(((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
				275	PyUnicode_AsUnicode(_PyObject_CAST(op)))
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	276
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	277	/* Py_DEPRECATED(3.3) */
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	278	#define PyUnicode_AS_DATA(op) \
				279	((const char *)(PyUnicode_AS_UNICODE(op)))
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	280
				281
				282	/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
				283
				284	/* Values for PyASCIIObject.state: */
				285
				286	/* Interning state. */
				287	#define SSTATE_NOT_INTERNED 0
				288	#define SSTATE_INTERNED_MORTAL 1
				289	#define SSTATE_INTERNED_IMMORTAL 2
				290
				291	/* Return true if the string contains only ASCII characters, or 0 if not. The
				292	string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
				293	ready. */
				294	#define PyUnicode_IS_ASCII(op) \
				295	(assert(PyUnicode_Check(op)), \
				296	assert(PyUnicode_IS_READY(op)), \
				297	((PyASCIIObject*)op)->state.ascii)
				298
				299	/* Return true if the string is compact or 0 if not.
				300	No type checks or Ready calls are performed. */
				301	#define PyUnicode_IS_COMPACT(op) \
				302	(((PyASCIIObject*)(op))->state.compact)
				303
				304	/* Return true if the string is a compact ASCII string (use PyASCIIObject
				305	structure), or 0 if not. No type checks or Ready calls are performed. */
				306	#define PyUnicode_IS_COMPACT_ASCII(op) \
				307	(((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op))
				308
				309	enum PyUnicode_Kind {
				310	/* String contains only wstr byte characters. This is only possible
				311	when the string was created with a legacy API and _PyUnicode_Ready()
				312	has not been called yet. */
				313	PyUnicode_WCHAR_KIND = 0,
				314	/* Return values of the PyUnicode_KIND() macro: */
				315	PyUnicode_1BYTE_KIND = 1,
				316	PyUnicode_2BYTE_KIND = 2,
				317	PyUnicode_4BYTE_KIND = 4
				318	};
				319
				320	/* Return pointers to the canonical representation cast to unsigned char,
				321	Py_UCS2, or Py_UCS4 for direct character access.
				322	No checks are performed, use PyUnicode_KIND() before to ensure
				323	these will work correctly. */
				324
				325	#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
				326	#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
				327	#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
				328
				329	/* Return one of the PyUnicode_*_KIND values defined above. */
				330	#define PyUnicode_KIND(op) \
				331	(assert(PyUnicode_Check(op)), \
				332	assert(PyUnicode_IS_READY(op)), \
				333	((PyASCIIObject *)(op))->state.kind)
				334
				335	/* Return a void pointer to the raw unicode buffer. */
				336	#define _PyUnicode_COMPACT_DATA(op) \
				337	(PyUnicode_IS_ASCII(op) ? \
				338	((void*)((PyASCIIObject*)(op) + 1)) : \
				339	((void*)((PyCompactUnicodeObject*)(op) + 1)))
				340
				341	#define _PyUnicode_NONCOMPACT_DATA(op) \
				342	(assert(((PyUnicodeObject*)(op))->data.any), \
				343	((((PyUnicodeObject *)(op))->data.any)))
				344
				345	#define PyUnicode_DATA(op) \
				346	(assert(PyUnicode_Check(op)), \
				347	PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
				348	_PyUnicode_NONCOMPACT_DATA(op))
				349
				350	/* In the access macros below, "kind" may be evaluated more than once.
				351	All other macro parameters are evaluated exactly once, so it is safe
				352	to put side effects into them (such as increasing the index). */
				353
				354	/* Write into the canonical representation, this macro does not do any sanity
				355	checks and is intended for usage in loops. The caller should cache the
				356	kind and data pointers obtained from other macro calls.
				357	index is the index in the string (starts at 0) and value is the new
				358	code point value which should be written to that location. */
				359	#define PyUnicode_WRITE(kind, data, index, value) \
				360	do { \
				361	switch ((kind)) { \
				362	case PyUnicode_1BYTE_KIND: { \
				363	((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
				364	break; \
				365	} \
				366	case PyUnicode_2BYTE_KIND: { \
				367	((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
				368	break; \
				369	} \
				370	default: { \
				371	assert((kind) == PyUnicode_4BYTE_KIND); \
				372	((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
				373	} \
				374	} \
				375	} while (0)
				376
				377	/* Read a code point from the string's canonical representation. No checks
				378	or ready calls are performed. */
				379	#define PyUnicode_READ(kind, data, index) \
				380	((Py_UCS4) \
				381	((kind) == PyUnicode_1BYTE_KIND ? \
				382	((const Py_UCS1 *)(data))[(index)] : \
				383	((kind) == PyUnicode_2BYTE_KIND ? \
				384	((const Py_UCS2 *)(data))[(index)] : \
				385	((const Py_UCS4 *)(data))[(index)] \
				386	) \
				387	))
				388
				389	/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
				390	calls PyUnicode_KIND() and might call it twice. For single reads, use
				391	PyUnicode_READ_CHAR, for multiple consecutive reads callers should
				392	cache kind and use PyUnicode_READ instead. */
				393	#define PyUnicode_READ_CHAR(unicode, index) \
				394	(assert(PyUnicode_Check(unicode)), \
				395	assert(PyUnicode_IS_READY(unicode)), \
				396	(Py_UCS4) \
				397	(PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
				398	((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
				399	(PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
				400	((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
				401	((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
				402	) \
				403	))
				404
				405	/* Returns the length of the unicode string. The caller has to make sure that
				406	the string has its canonical representation set before calling
				407	this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
				408	#define PyUnicode_GET_LENGTH(op) \
				409	(assert(PyUnicode_Check(op)), \
				410	assert(PyUnicode_IS_READY(op)), \
				411	((PyASCIIObject *)(op))->length)
				412
				413
				414	/* Fast check to determine whether an object is ready. Equivalent to
				415	(PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */
				416
				417	#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
				418
				419	/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
				420	case. If the canonical representation is not yet set, it will still call
				421	_PyUnicode_Ready().
				422	Returns 0 on success and -1 on errors. */
				423	#define PyUnicode_READY(op) \
				424	(assert(PyUnicode_Check(op)), \
				425	(PyUnicode_IS_READY(op) ? \
				426	0 : _PyUnicode_Ready(_PyObject_CAST(op))))
				427
				428	/* Return a maximum character value which is suitable for creating another
				429	string based on op. This is always an approximation but more efficient
				430	than iterating over the string. */
				431	#define PyUnicode_MAX_CHAR_VALUE(op) \
				432	(assert(PyUnicode_IS_READY(op)), \
				433	(PyUnicode_IS_ASCII(op) ? \
				434	(0x7f) : \
				435	(PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
				436	(0xffU) : \
				437	(PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
				438	(0xffffU) : \
				439	(0x10ffffU)))))
				440
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	441	Py_DEPRECATED(3.3)
				442	static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {
				443	return PyUnicode_IS_COMPACT_ASCII(op) ?
				444	((PyASCIIObject*)op)->length :
				445	((PyCompactUnicodeObject*)op)->wstr_length;
				446	}
				447	#define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)
				448
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	449	/* === Public API ========================================================= */
				450
				451	/* --- Plain Py_UNICODE --------------------------------------------------- */
				452
				453	/* With PEP 393, this is the recommended way to allocate a new unicode object.
				454	This function will allocate the object and its buffer in a single memory
				455	block. Objects created using this function are not resizable. */
				456	PyAPI_FUNC(PyObject*) PyUnicode_New(
				457	Py_ssize_t size, /* Number of code points in the new string */
				458	Py_UCS4 maxchar /* maximum code point value in the string */
				459	);
				460
				461	/* Initializes the canonical string representation from the deprecated
				462	wstr/Py_UNICODE representation. This function is used to convert Unicode
				463	objects which were created using the old API to the new flexible format
				464	introduced with PEP 393.
				465
				466	Don't call this function directly, use the public PyUnicode_READY() macro
				467	instead. */
				468	PyAPI_FUNC(int) _PyUnicode_Ready(
				469	PyObject *unicode /* Unicode object */
				470	);
				471
				472	/* Get a copy of a Unicode string. */
				473	PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
				474	PyObject *unicode
				475	);
				476
				477	/* Copy characters from one unicode object into another; this function performs
				478	character conversion when necessary and falls back to memcpy() if possible.
				479
				480	Fail if to is too small (smaller than how_many or smaller than
				481	len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
				482	kind(to), or if to has more than 1 reference.
				483
				484	Return the number of characters written, or return -1 and raise an exception
				485	on error.
				486
				487	Pseudo-code:
				488
				489	how_many = min(how_many, len(from) - from_start)
				490	to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
				491	return how_many
				492
				493	Note: The function doesn't write a terminating null character.
				494	*/
				495	PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
				496	PyObject *to,
				497	Py_ssize_t to_start,
				498	PyObject *from,
				499	Py_ssize_t from_start,
				500	Py_ssize_t how_many
				501	);
				502
				503	/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
				504	may crash if parameters are invalid (e.g. if the output string
				505	is too short). */
				506	PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
				507	PyObject *to,
				508	Py_ssize_t to_start,
				509	PyObject *from,
				510	Py_ssize_t from_start,
				511	Py_ssize_t how_many
				512	);
				513
				514	/* Fill a string with a character: write fill_char into
				515	unicode[start:start+length].
				516
				517	Fail if fill_char is bigger than the string maximum character, or if the
				518	string has more than 1 reference.
				519
				520	Return the number of characters written, or return -1 and raise an exception
				521	on error. */
				522	PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
				523	PyObject *unicode,
				524	Py_ssize_t start,
				525	Py_ssize_t length,
				526	Py_UCS4 fill_char
				527	);
				528
				529	/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
				530	if parameters are invalid (e.g. if length is longer than the string). */
				531	PyAPI_FUNC(void) _PyUnicode_FastFill(
				532	PyObject *unicode,
				533	Py_ssize_t start,
				534	Py_ssize_t length,
				535	Py_UCS4 fill_char
				536	);
				537
				538	/* Create a Unicode Object from the Py_UNICODE buffer u of the given
				539	size.
				540
				541	u may be NULL which causes the contents to be undefined. It is the
				542	user's responsibility to fill in the needed data afterwards. Note
				543	that modifying the Unicode object contents after construction is
				544	only allowed if u was set to NULL.
				545
				546	The buffer is copied into the new object. */
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	547	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	548	const Py_UNICODE *u, /* Unicode buffer */
				549	Py_ssize_t size /* size of buffer */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	550	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	551
				552	/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
				553	Scan the string to find the maximum character. */
				554	PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
				555	int kind,
				556	const void *buffer,
				557	Py_ssize_t size);
				558
				559	/* Create a new string from a buffer of ASCII characters.
				560	WARNING: Don't check if the string contains any non-ASCII character. */
				561	PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
				562	const char *buffer,
				563	Py_ssize_t size);
				564
				565	/* Compute the maximum character of the substring unicode[start:end].
				566	Return 127 for an empty string. */
				567	PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
				568	PyObject *unicode,
				569	Py_ssize_t start,
				570	Py_ssize_t end);
				571
				572	/* Return a read-only pointer to the Unicode object's internal
				573	Py_UNICODE buffer.
				574	If the wchar_t/Py_UNICODE representation is not yet available, this
				575	function will calculate it. */
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	576	Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	577	PyObject *unicode /* Unicode object */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	578	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	579
				580	/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
				581	contains null characters. */
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	582	Py_DEPRECATED(3.3) PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	583	PyObject *unicode /* Unicode object */
				584	);
				585
				586	/* Return a read-only pointer to the Unicode object's internal
				587	Py_UNICODE buffer and save the length at size.
				588	If the wchar_t/Py_UNICODE representation is not yet available, this
				589	function will calculate it. */
				590
Inada Naoki	2c4928d	2020-06-17 20:09:44 +0900	[diff] [blame]	591	Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	592	PyObject *unicode, /* Unicode object */
				593	Py_ssize_t *size /* location where to save the length */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	594	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	595
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	596
				597	/* --- _PyUnicodeWriter API ----------------------------------------------- */
				598
				599	typedef struct {
				600	PyObject *buffer;
				601	void *data;
				602	enum PyUnicode_Kind kind;
				603	Py_UCS4 maxchar;
				604	Py_ssize_t size;
				605	Py_ssize_t pos;
				606
				607	/* minimum number of allocated characters (default: 0) */
				608	Py_ssize_t min_length;
				609
				610	/* minimum character (default: 127, ASCII) */
				611	Py_UCS4 min_char;
				612
				613	/* If non-zero, overallocate the buffer (default: 0). */
				614	unsigned char overallocate;
				615
				616	/* If readonly is 1, buffer is a shared string (cannot be modified)
				617	and size is set to 0. */
				618	unsigned char readonly;
				619	} _PyUnicodeWriter ;
				620
				621	/* Initialize a Unicode writer.
				622	*
				623	* By default, the minimum buffer size is 0 character and overallocation is
				624	* disabled. Set min_length, min_char and overallocate attributes to control
				625	* the allocation of the buffer. */
				626	PyAPI_FUNC(void)
				627	_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
				628
				629	/* Prepare the buffer to write 'length' characters
				630	with the specified maximum character.
				631
				632	Return 0 on success, raise an exception and return -1 on error. */
				633	#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
				634	(((MAXCHAR) <= (WRITER)->maxchar \
				635	&& (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
				636	? 0 \
				637	: (((LENGTH) == 0) \
				638	? 0 \
				639	: _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
				640
				641	/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
				642	instead. */
				643	PyAPI_FUNC(int)
				644	_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
				645	Py_ssize_t length, Py_UCS4 maxchar);
				646
				647	/* Prepare the buffer to have at least the kind KIND.
				648	For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
				649	support characters in range U+0000-U+FFFF.
				650
				651	Return 0 on success, raise an exception and return -1 on error. */
				652	#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \
				653	(assert((KIND) != PyUnicode_WCHAR_KIND), \
				654	(KIND) <= (WRITER)->kind \
				655	? 0 \
				656	: _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
				657
				658	/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
				659	macro instead. */
				660	PyAPI_FUNC(int)
				661	_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
				662	enum PyUnicode_Kind kind);
				663
				664	/* Append a Unicode character.
				665	Return 0 on success, raise an exception and return -1 on error. */
				666	PyAPI_FUNC(int)
				667	_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
				668	Py_UCS4 ch
				669	);
				670
				671	/* Append a Unicode string.
				672	Return 0 on success, raise an exception and return -1 on error. */
				673	PyAPI_FUNC(int)
				674	_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
				675	PyObject *str /* Unicode string */
				676	);
				677
				678	/* Append a substring of a Unicode string.
				679	Return 0 on success, raise an exception and return -1 on error. */
				680	PyAPI_FUNC(int)
				681	_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
				682	PyObject *str, /* Unicode string */
				683	Py_ssize_t start,
				684	Py_ssize_t end
				685	);
				686
				687	/* Append an ASCII-encoded byte string.
				688	Return 0 on success, raise an exception and return -1 on error. */
				689	PyAPI_FUNC(int)
				690	_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
				691	const char *str, /* ASCII-encoded byte string */
				692	Py_ssize_t len /* number of bytes, or -1 if unknown */
				693	);
				694
				695	/* Append a latin1-encoded byte string.
				696	Return 0 on success, raise an exception and return -1 on error. */
				697	PyAPI_FUNC(int)
				698	_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
				699	const char *str, /* latin1-encoded byte string */
				700	Py_ssize_t len /* length in bytes */
				701	);
				702
				703	/* Get the value of the writer as a Unicode string. Clear the
				704	buffer of the writer. Raise an exception and return NULL
				705	on error. */
				706	PyAPI_FUNC(PyObject *)
				707	_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
				708
				709	/* Deallocate memory of a writer (clear its internal buffer). */
				710	PyAPI_FUNC(void)
				711	_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
				712
				713
				714	/* Format the object based on the format_spec, as defined in PEP 3101
				715	(Advanced String Formatting). */
				716	PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
				717	_PyUnicodeWriter *writer,
				718	PyObject *obj,
				719	PyObject *format_spec,
				720	Py_ssize_t start,
				721	Py_ssize_t end);
				722
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	723	/* --- Manage the default encoding ---------------------------------------- */
				724
				725	/* Returns a pointer to the default encoding (UTF-8) of the
				726	Unicode object unicode and the size of the encoded representation
				727	in bytes stored in *size.
				728
				729	In case of an error, no *size is set.
				730
				731	This function caches the UTF-8 encoded string in the unicodeobject
				732	and subsequent calls will return the same string. The memory is released
				733	when the unicodeobject is deallocated.
				734
				735	_PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
				736	support the previous internal function with the same behaviour.
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	737	*/
				738
				739	PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
				740	PyObject *unicode,
				741	Py_ssize_t *size);
				742
				743	#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
				744
				745	/* Returns a pointer to the default encoding (UTF-8) of the
				746	Unicode object unicode.
				747
				748	Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
				749	in the unicodeobject.
				750
				751	_PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
				752	support the previous internal function with the same behaviour.
				753
				754	Use of this API is DEPRECATED since no size information can be
				755	extracted from the returned data.
				756
				757	*** This API is for interpreter INTERNAL USE ONLY and will likely
				758	*** be removed or changed for Python 3.1.
				759
				760	*** If you need to access the Unicode object as UTF-8 bytes string,
				761	*** please use PyUnicode_AsUTF8String() instead.
				762
				763	*/
				764
				765	PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
				766
				767	#define _PyUnicode_AsString PyUnicode_AsUTF8
				768
				769	/* --- Generic Codecs ----------------------------------------------------- */
				770
				771	/* Encodes a Py_UNICODE buffer of the given size and returns a
				772	Python string object. */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	773	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	774	const Py_UNICODE *s, /* Unicode char buffer */
				775	Py_ssize_t size, /* number of Py_UNICODE chars to encode */
				776	const char *encoding, /* encoding */
				777	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	778	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	779
				780	/* --- UTF-7 Codecs ------------------------------------------------------- */
				781
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	782	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	783	const Py_UNICODE *data, /* Unicode char buffer */
				784	Py_ssize_t length, /* number of Py_UNICODE chars to encode */
				785	int base64SetO, /* Encode RFC2152 Set O characters in base64 */
				786	int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
				787	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	788	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	789
				790	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
				791	PyObject *unicode, /* Unicode object */
				792	int base64SetO, /* Encode RFC2152 Set O characters in base64 */
				793	int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
				794	const char *errors /* error handling */
				795	);
				796
				797	/* --- UTF-8 Codecs ------------------------------------------------------- */
				798
				799	PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
				800	PyObject *unicode,
				801	const char *errors);
				802
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	803	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	804	const Py_UNICODE *data, /* Unicode char buffer */
				805	Py_ssize_t length, /* number of Py_UNICODE chars to encode */
				806	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	807	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	808
				809	/* --- UTF-32 Codecs ------------------------------------------------------ */
				810
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	811	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	812	const Py_UNICODE *data, /* Unicode char buffer */
				813	Py_ssize_t length, /* number of Py_UNICODE chars to encode */
				814	const char *errors, /* error handling */
				815	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	816	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	817
				818	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
				819	PyObject *object, /* Unicode object */
				820	const char *errors, /* error handling */
				821	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
				822	);
				823
				824	/* --- UTF-16 Codecs ------------------------------------------------------ */
				825
				826	/* Returns a Python string object holding the UTF-16 encoded value of
				827	the Unicode data.
				828
				829	If byteorder is not 0, output is written according to the following
				830	byte order:
				831
				832	byteorder == -1: little endian
				833	byteorder == 0: native byte order (writes a BOM mark)
				834	byteorder == 1: big endian
				835
				836	If byteorder is 0, the output string will always start with the
				837	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
				838	prepended.
				839
				840	Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
				841	UCS-2. This trick makes it possible to add full UTF-16 capabilities
				842	at a later point without compromising the APIs.
				843
				844	*/
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	845	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	846	const Py_UNICODE *data, /* Unicode char buffer */
				847	Py_ssize_t length, /* number of Py_UNICODE chars to encode */
				848	const char *errors, /* error handling */
				849	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	850	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	851
				852	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
				853	PyObject* unicode, /* Unicode object */
				854	const char *errors, /* error handling */
				855	int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
				856	);
				857
				858	/* --- Unicode-Escape Codecs ---------------------------------------------- */
				859
				860	/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
				861	chars. */
				862	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
				863	const char *string, /* Unicode-Escape encoded string */
				864	Py_ssize_t length, /* size of string */
				865	const char *errors, /* error handling */
				866	const char **first_invalid_escape /* on return, points to first
				867	invalid escaped char in
				868	string. */
				869	);
				870
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	871	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	872	const Py_UNICODE *data, /* Unicode char buffer */
				873	Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	874	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	875
				876	/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
				877
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	878	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	879	const Py_UNICODE *data, /* Unicode char buffer */
				880	Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	881	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	882
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	883	/* --- Latin-1 Codecs ----------------------------------------------------- */
				884
				885	PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
				886	PyObject* unicode,
				887	const char* errors);
				888
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	889	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	890	const Py_UNICODE *data, /* Unicode char buffer */
				891	Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
				892	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	893	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	894
				895	/* --- ASCII Codecs ------------------------------------------------------- */
				896
				897	PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
				898	PyObject* unicode,
				899	const char* errors);
				900
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	901	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	902	const Py_UNICODE *data, /* Unicode char buffer */
				903	Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
				904	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	905	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	906
				907	/* --- Character Map Codecs ----------------------------------------------- */
				908
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	909	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	910	const Py_UNICODE *data, /* Unicode char buffer */
				911	Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
				912	PyObject *mapping, /* encoding mapping */
				913	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	914	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	915
				916	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
				917	PyObject *unicode, /* Unicode object */
				918	PyObject *mapping, /* encoding mapping */
				919	const char *errors /* error handling */
				920	);
				921
				922	/* Translate a Py_UNICODE buffer of the given length by applying a
				923	character mapping table to it and return the resulting Unicode
				924	object.
				925
				926	The mapping table must map Unicode ordinal integers to Unicode strings,
				927	Unicode ordinal integers or None (causing deletion of the character).
				928
				929	Mapping tables may be dictionaries or sequences. Unmapped character
				930	ordinals (ones which cause a LookupError) are left untouched and
				931	are copied as-is.
				932
				933	*/
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	934	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	935	const Py_UNICODE *data, /* Unicode char buffer */
				936	Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
				937	PyObject *table, /* Translate table */
				938	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	939	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	940
				941	/* --- MBCS codecs for Windows -------------------------------------------- */
				942
				943	#ifdef MS_WINDOWS
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	944	Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	945	const Py_UNICODE *data, /* Unicode char buffer */
				946	Py_ssize_t length, /* number of Py_UNICODE chars to encode */
				947	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	948	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	949	#endif
				950
				951	/* --- Decimal Encoder ---------------------------------------------------- */
				952
				953	/* Takes a Unicode string holding a decimal value and writes it into
				954	an output buffer using standard ASCII digit codes.
				955
				956	The output buffer has to provide at least length+1 bytes of storage
				957	area. The output string is 0-terminated.
				958
				959	The encoder converts whitespace to ' ', decimal characters to their
				960	corresponding ASCII digit and all other Latin-1 characters except
				961	\0 as-is. Characters outside this range (Unicode ordinals 1-256)
				962	are treated as errors. This includes embedded NULL bytes.
				963
				964	Error handling is defined by the errors argument:
				965
				966	NULL or "strict": raise a ValueError
				967	"ignore": ignore the wrong characters (these are not copied to the
				968	output buffer)
				969	"replace": replaces illegal characters with '?'
				970
				971	Returns 0 on success, -1 on failure.
				972
				973	*/
				974
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	975	/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	976	Py_UNICODE *s, /* Unicode buffer */
				977	Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
				978	char *output, /* Output buffer; must have size >= length */
				979	const char *errors /* error handling */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	980	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	981
				982	/* Transforms code points that have decimal digit property to the
				983	corresponding ASCII digit code points.
				984
				985	Returns a new Unicode string on success, NULL on failure.
				986	*/
				987
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	988	/* Py_DEPRECATED(3.3) */
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	989	PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
				990	Py_UNICODE *s, /* Unicode buffer */
				991	Py_ssize_t length /* Number of Py_UNICODE chars to transform */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	992	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	993
				994	/* Converts a Unicode object holding a decimal value to an ASCII string
				995	for using in int, float and complex parsers.
				996	Transforms code points that have decimal digit property to the
				997	corresponding ASCII digit code points. Transforms spaces to ASCII.
				998	Transforms code points starting from the first non-ASCII code point that
				999	is neither a decimal digit nor a space to the end into '?'. */
				1000
				1001	PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
				1002	PyObject *unicode /* Unicode object */
				1003	);
				1004
				1005	/* --- Methods & Slots ---------------------------------------------------- */
				1006
				1007	PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
				1008	PyObject *separator,
				1009	PyObject *const *items,
				1010	Py_ssize_t seqlen
				1011	);
				1012
				1013	/* Test whether a unicode is equal to ASCII identifier. Return 1 if true,
				1014	0 otherwise. The right argument must be ASCII identifier.
				1015	Any error that occurs inside will be cleared before returning. */
				1016	PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
				1017	PyObject *left, /* Left string */
				1018	_Py_Identifier *right /* Right identifier */
				1019	);
				1020
				1021	/* Test whether a unicode is equal to ASCII string. Return 1 if true,
				1022	0 otherwise. The right argument must be ASCII-encoded string.
				1023	Any error that occurs inside will be cleared before returning. */
				1024	PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
				1025	PyObject *left,
				1026	const char *right /* ASCII-encoded string */
				1027	);
				1028
				1029	/* Externally visible for str.strip(unicode) */
				1030	PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
				1031	PyObject *self,
				1032	int striptype,
				1033	PyObject *sepobj
				1034	);
				1035
				1036	/* Using explicit passed-in values, insert the thousands grouping
				1037	into the string pointed to by buffer. For the argument descriptions,
				1038	see Objects/stringlib/localeutil.h */
				1039	PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
				1040	_PyUnicodeWriter *writer,
				1041	Py_ssize_t n_buffer,
				1042	PyObject *digits,
				1043	Py_ssize_t d_pos,
				1044	Py_ssize_t n_digits,
				1045	Py_ssize_t min_width,
				1046	const char *grouping,
				1047	PyObject *thousands_sep,
				1048	Py_UCS4 *maxchar);
				1049
				1050	/* === Characters Type APIs =============================================== */
				1051
				1052	/* Helper array used by Py_UNICODE_ISSPACE(). */
				1053
				1054	PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
				1055
				1056	/* These should not be used directly. Use the Py_UNICODE_IS* and
				1057	Py_UNICODE_TO* macros instead.
				1058
				1059	These APIs are implemented in Objects/unicodectype.c.
				1060
				1061	*/
				1062
				1063	PyAPI_FUNC(int) _PyUnicode_IsLowercase(
				1064	Py_UCS4 ch /* Unicode character */
				1065	);
				1066
				1067	PyAPI_FUNC(int) _PyUnicode_IsUppercase(
				1068	Py_UCS4 ch /* Unicode character */
				1069	);
				1070
				1071	PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
				1072	Py_UCS4 ch /* Unicode character */
				1073	);
				1074
				1075	PyAPI_FUNC(int) _PyUnicode_IsXidStart(
				1076	Py_UCS4 ch /* Unicode character */
				1077	);
				1078
				1079	PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
				1080	Py_UCS4 ch /* Unicode character */
				1081	);
				1082
				1083	PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
				1084	const Py_UCS4 ch /* Unicode character */
				1085	);
				1086
				1087	PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
				1088	const Py_UCS4 ch /* Unicode character */
				1089	);
				1090
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	1091	/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1092	Py_UCS4 ch /* Unicode character */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	1093	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1094
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	1095	/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1096	Py_UCS4 ch /* Unicode character */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	1097	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1098
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	1099	Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1100	Py_UCS4 ch /* Unicode character */
Zackery Spytz	3c8724f	2019-05-28 09:16:33 -0600	[diff] [blame]	1101	);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1102
				1103	PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
				1104	Py_UCS4 ch, /* Unicode character */
				1105	Py_UCS4 *res
				1106	);
				1107
				1108	PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
				1109	Py_UCS4 ch, /* Unicode character */
				1110	Py_UCS4 *res
				1111	);
				1112
				1113	PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
				1114	Py_UCS4 ch, /* Unicode character */
				1115	Py_UCS4 *res
				1116	);
				1117
				1118	PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
				1119	Py_UCS4 ch, /* Unicode character */
				1120	Py_UCS4 *res
				1121	);
				1122
				1123	PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
				1124	Py_UCS4 ch /* Unicode character */
				1125	);
				1126
				1127	PyAPI_FUNC(int) _PyUnicode_IsCased(
				1128	Py_UCS4 ch /* Unicode character */
				1129	);
				1130
				1131	PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
				1132	Py_UCS4 ch /* Unicode character */
				1133	);
				1134
				1135	PyAPI_FUNC(int) _PyUnicode_ToDigit(
				1136	Py_UCS4 ch /* Unicode character */
				1137	);
				1138
				1139	PyAPI_FUNC(double) _PyUnicode_ToNumeric(
				1140	Py_UCS4 ch /* Unicode character */
				1141	);
				1142
				1143	PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
				1144	Py_UCS4 ch /* Unicode character */
				1145	);
				1146
				1147	PyAPI_FUNC(int) _PyUnicode_IsDigit(
				1148	Py_UCS4 ch /* Unicode character */
				1149	);
				1150
				1151	PyAPI_FUNC(int) _PyUnicode_IsNumeric(
				1152	Py_UCS4 ch /* Unicode character */
				1153	);
				1154
				1155	PyAPI_FUNC(int) _PyUnicode_IsPrintable(
				1156	Py_UCS4 ch /* Unicode character */
				1157	);
				1158
				1159	PyAPI_FUNC(int) _PyUnicode_IsAlpha(
				1160	Py_UCS4 ch /* Unicode character */
				1161	);
				1162
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1163	PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
				1164
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1165	/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
				1166	PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
Victor Stinner	75e4699	2018-11-26 17:29:38 +0100	[diff] [blame]	1167
				1168	/* Fast equality check when the inputs are known to be exact unicode types
				1169	and where the hash values are equal (i.e. a very probable match) */
				1170	PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
				1171
Serhiy Storchaka	74ea6b5	2020-05-12 12:42:04 +0300	[diff] [blame]	1172	PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);