blob: b0ac086a6be23de8c61299a0081049f1ed863212 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
/* Python 3.x requires unicode, so this feature macro is defined
   unconditionally. */
#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
/* Set these flags if the platform has "wchar.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
#  include <wchar.h>
#endif
99
/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations: unsigned integers of exactly 32, 16 and
   8 bits. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
Barry Warsaw51ac5802000-03-20 16:36:48 +0000106#ifdef __cplusplus
107extern "C" {
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110
Mark Hammond91a681d2002-08-12 07:21:58 +0000111PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000112PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000113
/* PyUnicode_Check(op): test the type of op for the
   Py_TPFLAGS_UNICODE_SUBCLASS flag via PyType_FastSubclass(). */
#define PyUnicode_Check(op) \
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)

/* PyUnicode_CheckExact(op): test whether the type of op is exactly
   PyUnicode_Type, via Py_IS_TYPE(). */
#define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000117
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000126
127/* === Public API ========================================================= */
128
Georg Brandl952867a2010-06-27 10:17:12 +0000129/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000130PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000131 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000132 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000133 );
134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000135/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000137PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000138 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000139 );
140
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200141#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200142PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143 PyObject *str,
144 Py_ssize_t start,
145 Py_ssize_t end);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200146#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200148#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Georg Brandldb6c7f52011-10-07 11:19:11 +0200149/* Copy the string into a UCS4 buffer including the null character if copy_null
Serhiy Storchakacc164232016-10-02 21:29:26 +0300150 is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200151 the buffer is smaller than the string. Return buffer on success.
152
153 buflen is the length of the buffer in (Py_UCS4) characters. */
154PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155 PyObject *unicode,
156 Py_UCS4* buffer,
157 Py_ssize_t buflen,
158 int copy_null);
159
160/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161 * PyMem_Malloc; if this fails, NULL is returned with a memory error
162 exception set. */
163PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200164#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200165
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200166#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Guido van Rossumd8225182000-03-10 22:33:05 +0000167/* Get the length of the Unicode object. */
168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170 PyObject *unicode
171);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200172#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200173
Victor Stinner157f83f2011-09-28 21:41:31 +0200174/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200175 string representation. */
176
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000178 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600179 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000180
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200181#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200182/* Read a character from the string. */
183
184PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185 PyObject *unicode,
186 Py_ssize_t index
187 );
188
189/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200190 PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200193
194PyAPI_FUNC(int) PyUnicode_WriteChar(
195 PyObject *unicode,
196 Py_ssize_t index,
197 Py_UCS4 character
198 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200199#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200200
Martin Panter6245cb32016-04-15 02:14:19 +0000201/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000204
205 *unicode is modified to point to the new (resized) object and 0
206 returned on success.
207
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100208 Try to resize the string in place (which is usually faster than allocating
209 a new string and copy characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000210
211 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100212 is returned and *unicode left untouched.
213
214 WARNING: The function doesn't check string content, the result may not be a
215 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000216
Mark Hammond91a681d2002-08-12 07:21:58 +0000217PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000218 PyObject **unicode, /* Pointer to the Unicode object */
219 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000220 );
221
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300222/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000223
Martin Panter20d32552016-04-15 00:56:21 +0000224 bytes, bytearray and other bytes-like objects are decoded according to the
225 given encoding and error handler. The encoding and error handler can be
226 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000227
Martin Panter20d32552016-04-15 00:56:21 +0000228 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000229
230 The API returns NULL in case of an error. The caller is responsible
231 for decref'ing the returned objects.
232
233*/
234
Mark Hammond91a681d2002-08-12 07:21:58 +0000235PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200236 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000237 const char *encoding, /* encoding */
238 const char *errors /* error handling */
239 );
240
Martin Panter20d32552016-04-15 00:56:21 +0000241/* Copy an instance of a Unicode subtype to a new true Unicode object if
242 necessary. If obj is already a true Unicode object (not a subtype), return
243 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000244
245 The API returns NULL in case of an error. The caller is responsible
246 for decref'ing the returned objects.
247
248*/
249
Mark Hammond91a681d2002-08-12 07:21:58 +0000250PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200251 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000252 );
253
Victor Stinner1205f272010-09-11 00:54:47 +0000254PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255 const char *format, /* ASCII-encoded string */
256 va_list vargs
257 );
258PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259 const char *format, /* ASCII-encoded string */
260 ...
261 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000262
Walter Dörwald16807132007-05-25 13:52:07 +0000263PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000264PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
265 const char *u /* UTF-8 encoded string */
266 );
Walter Dörwald16807132007-05-25 13:52:07 +0000267
Victor Stinner583ee5a2020-10-02 14:49:00 +0200268// PyUnicode_InternImmortal() is deprecated since Python 3.10
269// and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
270Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
271
/* Read the interned state of a string.
   Use only if you know it's a string: op is cast to PyASCIIObject*
   without any type check. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000275
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Convert the Unicode object to a wide character string. The output string
   always ends with a nul character. If size is not NULL, write the number of
   wide characters (excluding the null character) into *size.

   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
   on success. On error, returns NULL and raises a MemoryError; *size is
   undefined in that case. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* number of characters of the result */
    );

#endif /* HAVE_WCHAR_H */
322
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000323/* --- Unicode ordinals --------------------------------------------------- */
324
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325/* Create a Unicode Object from the given Unicode code point ordinal.
326
Ezio Melottie7f90372012-10-05 03:33:31 +0300327 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000328 raised in case it is not.
329
330*/
331
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000332PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000333
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments encoding and errors. These
   parameters encoding and errors have the same semantics as the ones
   of the builtin str() API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic interface are documented.

*/
351
Fred Drakecb093fe2000-05-09 19:51:53 +0000352/* --- Manage the default encoding ---------------------------------------- */
353
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000354/* Returns "utf-8". */
Mark Hammond91a681d2002-08-12 07:21:58 +0000355PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000356
Guido van Rossumd8225182000-03-10 22:33:05 +0000357/* --- Generic Codecs ----------------------------------------------------- */
358
359/* Create a Unicode object by decoding the encoded string s of the
360 given size. */
361
Mark Hammond91a681d2002-08-12 07:21:58 +0000362PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000363 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000365 const char *encoding, /* encoding */
366 const char *errors /* error handling */
367 );
368
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000369/* Decode a Unicode object unicode and return the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300370 object.
371
372 This API is DEPRECATED. The only supported standard encoding is rot13.
373 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
374 that decode from str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000375
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600376Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 PyObject *unicode, /* Unicode object */
378 const char *encoding, /* encoding */
379 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600380 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000381
382/* Decode a Unicode object unicode and return the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300383 object.
384
385 This API is DEPRECATED. The only supported standard encoding is rot13.
386 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
387 that decode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000388
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600389Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 PyObject *unicode, /* Unicode object */
391 const char *encoding, /* encoding */
392 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600393 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000394
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000395/* Encodes a Unicode object and returns the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300396 object.
397
Ville Skyttä49b27342017-08-03 09:00:59 +0300398 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
Serhiy Storchaka00939072016-10-27 21:05:49 +0300399 since all standard encodings (except rot13) encode str to bytes.
400 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
401 that encode form str to non-bytes. */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000402
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600403Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 PyObject *unicode, /* Unicode object */
405 const char *encoding, /* encoding */
406 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600407 );
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000408
Guido van Rossumd8225182000-03-10 22:33:05 +0000409/* Encodes a Unicode object and returns the result as Python string
410 object. */
411
Mark Hammond91a681d2002-08-12 07:21:58 +0000412PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000413 PyObject *unicode, /* Unicode object */
414 const char *encoding, /* encoding */
415 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000416 );
417
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000418/* Encodes a Unicode object and returns the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300419 object.
420
421 This API is DEPRECATED. The only supported standard encodings is rot13.
422 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
423 that encode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000424
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600425Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 PyObject *unicode, /* Unicode object */
427 const char *encoding, /* encoding */
428 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600429 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000430
431/* Build an encoding map. */
432
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000433PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
434 PyObject* string /* 256 character map */
435 );
436
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000437/* --- UTF-7 Codecs ------------------------------------------------------- */
438
Mark Hammond91a681d2002-08-12 07:21:58 +0000439PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000440 const char *string, /* UTF-7 encoded string */
441 Py_ssize_t length, /* size of string */
442 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000443 );
444
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000445PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000446 const char *string, /* UTF-7 encoded string */
447 Py_ssize_t length, /* size of string */
448 const char *errors, /* error handling */
449 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000450 );
451
Guido van Rossumd8225182000-03-10 22:33:05 +0000452/* --- UTF-8 Codecs ------------------------------------------------------- */
453
Mark Hammond91a681d2002-08-12 07:21:58 +0000454PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000455 const char *string, /* UTF-8 encoded string */
456 Py_ssize_t length, /* size of string */
457 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000458 );
459
Walter Dörwald69652032004-09-07 20:24:22 +0000460PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000461 const char *string, /* UTF-8 encoded string */
462 Py_ssize_t length, /* size of string */
463 const char *errors, /* error handling */
464 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000465 );
466
Mark Hammond91a681d2002-08-12 07:21:58 +0000467PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000469 );
470
Alex Gaynor3a8fdb22020-10-19 18:17:50 -0400471/* Returns a pointer to the default encoding (UTF-8) of the
472 Unicode object unicode and the size of the encoded representation
473 in bytes stored in *size.
474
475 In case of an error, no *size is set.
476
477 This function caches the UTF-8 encoded string in the unicodeobject
478 and subsequent calls will return the same string. The memory is released
479 when the unicodeobject is deallocated.
480*/
481
482#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
483PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
484 PyObject *unicode,
485 Py_ssize_t *size);
486#endif
487
Walter Dörwald41980ca2007-08-16 21:55:45 +0000488/* --- UTF-32 Codecs ------------------------------------------------------ */
489
490/* Decodes length bytes from a UTF-32 encoded buffer string and returns
491 the corresponding Unicode object.
492
493 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000494 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000495
496 If byteorder is non-NULL, the decoder starts decoding using the
497 given byte order:
498
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000499 *byteorder == -1: little endian
500 *byteorder == 0: native order
501 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000502
503 In native mode, the first four bytes of the stream are checked for a
504 BOM mark. If found, the BOM mark is analysed, the byte order
505 adjusted and the BOM skipped. In the other modes, no BOM mark
506 interpretation is done. After completion, *byteorder is set to the
507 current byte order at the end of input data.
508
509 If byteorder is NULL, the codec starts in native order mode.
510
511*/
512
513PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000514 const char *string, /* UTF-32 encoded string */
515 Py_ssize_t length, /* size of string */
516 const char *errors, /* error handling */
517 int *byteorder /* pointer to byteorder to use
518 0=native;-1=LE,1=BE; updated on
519 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000520 );
521
522PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000523 const char *string, /* UTF-32 encoded string */
524 Py_ssize_t length, /* size of string */
525 const char *errors, /* error handling */
526 int *byteorder, /* pointer to byteorder to use
527 0=native;-1=LE,1=BE; updated on
528 exit */
529 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000530 );
531
532/* Returns a Python string using the UTF-32 encoding in native byte
533 order. The string always starts with a BOM mark. */
534
535PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000536 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000537 );
538
/* NOTE(review): no declaration follows this comment in this part of the
   header; it appears to describe a UTF-32 encoder that takes a byteorder
   argument -- confirm where that function is declared.

   Returns a Python string object holding the UTF-32 encoded value of
   the Unicode data.

   If byteorder is not 0, output is written according to the following
   byte order:

   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM mark)
   byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.

*/
554
Guido van Rossumd8225182000-03-10 22:33:05 +0000555/* --- UTF-16 Codecs ------------------------------------------------------ */
556
Guido van Rossum9e896b32000-04-05 20:11:21 +0000557/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000558 the corresponding Unicode object.
559
560 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000562
563 If byteorder is non-NULL, the decoder starts decoding using the
564 given byte order:
565
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 *byteorder == -1: little endian
567 *byteorder == 0: native order
568 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000569
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000570 In native mode, the first two bytes of the stream are checked for a
571 BOM mark. If found, the BOM mark is analysed, the byte order
572 adjusted and the BOM skipped. In the other modes, no BOM mark
573 interpretation is done. After completion, *byteorder is set to the
574 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000575
576 If byteorder is NULL, the codec starts in native order mode.
577
578*/
579
Mark Hammond91a681d2002-08-12 07:21:58 +0000580PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000581 const char *string, /* UTF-16 encoded string */
582 Py_ssize_t length, /* size of string */
583 const char *errors, /* error handling */
584 int *byteorder /* pointer to byteorder to use
585 0=native;-1=LE,1=BE; updated on
586 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000587 );
588
/* Takes the same parameters as PyUnicode_DecodeUTF16(), plus consumed.

   NOTE(review): consumed is annotated only as "bytes consumed"; it is
   presumably an out-parameter receiving the number of input bytes that
   were actually decoded -- confirm against the implementation. */
Walter Dörwald69652032004-09-07 20:24:22 +0000589PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000590 const char *string, /* UTF-16 encoded string */
591 Py_ssize_t length, /* size of string */
592 const char *errors, /* error handling */
593 int *byteorder, /* pointer to byteorder to use
594 0=native;-1=LE,1=BE; updated on
595 exit */
596 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000597 );
598
Guido van Rossumd8225182000-03-10 22:33:05 +0000599/* Returns a Python bytes object using the UTF-16 encoding in native byte
600 order. The data always starts with a BOM mark. */
601
Mark Hammond91a681d2002-08-12 07:21:58 +0000602PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000603 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000604 );
605
Guido van Rossumd8225182000-03-10 22:33:05 +0000606/* --- Unicode-Escape Codecs ---------------------------------------------- */
607
Mark Hammond91a681d2002-08-12 07:21:58 +0000608PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000609 const char *string, /* Unicode-Escape encoded string */
610 Py_ssize_t length, /* size of string */
611 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000612 );
613
Mark Hammond91a681d2002-08-12 07:21:58 +0000614PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000616 );
617
Guido van Rossumd8225182000-03-10 22:33:05 +0000618/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
619
Mark Hammond91a681d2002-08-12 07:21:58 +0000620PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 const char *string, /* Raw-Unicode-Escape encoded string */
622 Py_ssize_t length, /* size of string */
623 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000624 );
625
Mark Hammond91a681d2002-08-12 07:21:58 +0000626PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000628 );
629
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000631
Victor Stinner75e46992018-11-26 17:29:38 +0100632 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000633
Mark Hammond91a681d2002-08-12 07:21:58 +0000634PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 const char *string, /* Latin-1 encoded string */
636 Py_ssize_t length, /* size of string */
637 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000638 );
639
Mark Hammond91a681d2002-08-12 07:21:58 +0000640PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000641 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000642 );
643
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000645
646 Only 7-bit ASCII data is accepted. All other codes generate errors.
647
648*/
649
Mark Hammond91a681d2002-08-12 07:21:58 +0000650PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 const char *string, /* ASCII encoded string */
652 Py_ssize_t length, /* size of string */
653 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000654 );
655
Mark Hammond91a681d2002-08-12 07:21:58 +0000656PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000658 );
659
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000660/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000661
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +0000663
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200664 Decoding mappings must map byte ordinals (integers in the range from 0 to
665 255) to Unicode strings, integers (which are then interpreted as Unicode
666 ordinals) or None. Unmapped data bytes (ones which cause a LookupError),
667 as well as bytes mapped to None, 0xFFFE or '\ufffe', are treated as
668 "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000669
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200670 Encoding mappings must map Unicode ordinal integers to bytes objects,
671 integers in the range from 0 to 255 or None. Unmapped character
672 ordinals (ones which cause a LookupError), as well as ordinals mapped
673 to None, are treated as "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000674
675*/
676
Mark Hammond91a681d2002-08-12 07:21:58 +0000677PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 const char *string, /* Encoded string */
679 Py_ssize_t length, /* size of string */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200680 PyObject *mapping, /* decoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000681 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000682 );
683
Mark Hammond91a681d2002-08-12 07:21:58 +0000684PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000685 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200686 PyObject *mapping /* encoding mapping */
Guido van Rossumd8225182000-03-10 22:33:05 +0000687 );
688
Guido van Rossumefec1152000-03-28 02:01:15 +0000689/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000690
Victor Stinner75e46992018-11-26 17:29:38 +0100691#ifdef MS_WINDOWS
Mark Hammond91a681d2002-08-12 07:21:58 +0000692PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +0000693 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -0700694 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +0000695 const char *errors /* error handling */
696 );
697
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000698PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
699 const char *string, /* MBCS encoded string */
700 Py_ssize_t length, /* size of string */
701 const char *errors, /* error handling */
702 Py_ssize_t *consumed /* bytes consumed */
703 );
704
/* Declared only when the limited API is not in use, or when the requested
   Py_LIMITED_API value is at least 0x03030000.

   Takes the same string/length/errors/consumed parameters as
   PyUnicode_DecodeMBCSStateful(), plus an explicit code page number. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200705#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200706PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
707 int code_page, /* code page number */
708 const char *string, /* encoded string */
709 Py_ssize_t length, /* size of string */
710 const char *errors, /* error handling */
711 Py_ssize_t *consumed /* bytes consumed */
712 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200713#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200714
Mark Hammond91a681d2002-08-12 07:21:58 +0000715PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +0000716 PyObject *unicode /* Unicode object */
717 );
718
/* Declared only when the limited API is not in use, or when the requested
   Py_LIMITED_API value is at least 0x03030000.

   Encodes a Unicode object using the given code page number; errors
   selects the error handling. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200719#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200720PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
721 int code_page, /* code page number */
722 PyObject *unicode, /* Unicode object */
723 const char *errors /* error handling */
724 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200725#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200726
Steve Dowercc16be82016-09-08 10:35:16 -0700727#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000728
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100729/* --- Locale encoding --------------------------------------------------- */
730
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200731#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100732/* Decode a string from the current locale encoding. The decoder is strict if
733 *errors* is NULL or "strict"; if *errors* is "surrogateescape", it uses the
734 'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
735 byte sequence can be decoded as a surrogate character and *errors* is
736 "surrogateescape", the byte sequence is escaped using the 'surrogateescape'
737 error handler instead of being decoded. *str* must end with a null character
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100738 but cannot contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100739
740PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
741 const char *str,
742 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +0100743 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100744
745/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
746 length using strlen(). */
747
748PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
749 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +0100750 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100751
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100752/* Encode a Unicode object to the current locale encoding. The encoder is
753 strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape",
754 the "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +0100755 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100756
757PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
758 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +0100759 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100760 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200761#endif
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100762
Martin v. Löwis011e8422009-05-05 04:43:17 +0000763/* --- File system encoding ---------------------------------------------- */
764
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000765/* ParseTuple converter: encode str objects to bytes using
766 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +0000767
768PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
769
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000770/* ParseTuple converter: decode bytes objects to unicode using
771 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
772
773PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
774
Victor Stinner77c38622010-05-14 15:58:55 +0000775/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
776 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000777
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000778 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
779 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000780
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000781 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000782*/
783
784PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
785 const char *s /* encoded string */
786 );
787
Victor Stinner77c38622010-05-14 15:58:55 +0000788/* Decode a string using Py_FileSystemDefaultEncoding
789 and the "surrogateescape" error handler.
790
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000791 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
792 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +0000793*/
794
Martin v. Löwis011e8422009-05-05 04:43:17 +0000795PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
796 const char *s, /* encoded string */
797 Py_ssize_t size /* size */
798 );
799
Victor Stinnerae6265f2010-05-15 16:27:27 +0000800/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000801 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000802
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000803 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
804 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000805*/
806
807PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
808 PyObject *unicode
809 );
810
Guido van Rossumd8225182000-03-10 22:33:05 +0000811/* --- Methods & Slots ----------------------------------------------------
812
813 These are capable of handling Unicode objects and strings on input
814 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200815 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000816
817/* Concat two strings giving a new Unicode string. */
818
Mark Hammond91a681d2002-08-12 07:21:58 +0000819PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 PyObject *left, /* Left string */
821 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000822 );
823
Walter Dörwald1ab83302007-05-18 17:15:44 +0000824/* Concat two strings and put the result in *pleft
825 (sets *pleft to NULL on error) */
826
827PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 PyObject **pleft, /* Pointer to left string */
829 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000830 );
831
832/* Concat two strings, put the result in *pleft and drop the right object
833 (sets *pleft to NULL on error) */
834
835PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000836 PyObject **pleft, /* Pointer to left string */
837 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000838 );
839
Guido van Rossumd8225182000-03-10 22:33:05 +0000840/* Split a string giving a list of Unicode strings.
841
842 If sep is NULL, splitting will be done at all whitespace
843 substrings. Otherwise, splits occur at the given separator.
844
845 At most maxsplit splits will be done. If negative, no limit is set.
846
847 Separators are not included in the resulting list.
848
849*/
850
Mark Hammond91a681d2002-08-12 07:21:58 +0000851PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 PyObject *s, /* String to split */
853 PyObject *sep, /* String separator */
854 Py_ssize_t maxsplit /* Maxsplit count */
855 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000856
857/* Ditto, but split at line breaks.
858
859 CRLF is considered to be one line break. Line breaks are not
860 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861
Mark Hammond91a681d2002-08-12 07:21:58 +0000862PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000863 PyObject *s, /* String to split */
864 int keepends /* If true, line end markers are included */
865 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000866
Thomas Wouters477c8d52006-05-27 19:21:47 +0000867/* Partition a string using a given separator. */
868
869PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 PyObject *s, /* String to partition */
871 PyObject *sep /* String separator */
872 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873
874/* Partition a string using a given separator, searching from the end of the
875 string. */
876
877PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000878 PyObject *s, /* String to partition */
879 PyObject *sep /* String separator */
880 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000882/* Split a string giving a list of Unicode strings.
883
884 If sep is NULL, splitting will be done at all whitespace
885 substrings. Otherwise, splits occur at the given separator.
886
887 At most maxsplit splits will be done; if maxsplit is negative, no
888 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
889 the end of the string.
890
891 Separators are not included in the resulting list.
892
893*/
894
895PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896 PyObject *s, /* String to split */
897 PyObject *sep, /* String separator */
898 Py_ssize_t maxsplit /* Maxsplit count */
899 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000900
Guido van Rossumd8225182000-03-10 22:33:05 +0000901/* Translate a string by applying a character mapping table to it and
902 return the resulting Unicode object.
903
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200904 The mapping table must map Unicode ordinal integers to Unicode strings,
905 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +0000906
907 Mapping tables may be dictionaries or sequences. Unmapped character
908 ordinals (ones which cause a LookupError) are left untouched and
909 are copied as-is.
910
911*/
912
Mark Hammond91a681d2002-08-12 07:21:58 +0000913PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 PyObject *str, /* String */
915 PyObject *table, /* Translate table */
916 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000917 );
918
919/* Join a sequence of strings using the given separator and return
920 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921
Mark Hammond91a681d2002-08-12 07:21:58 +0000922PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 PyObject *separator, /* Separator string */
924 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000925 );
926
927/* Return 1 if substr matches str[start:end] at the given tail end, 0
928 otherwise. Return -1 if an error occurred. */
929
Martin v. Löwis18e16552006-02-15 17:27:45 +0000930PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931 PyObject *str, /* String */
932 PyObject *substr, /* Prefix or Suffix string */
933 Py_ssize_t start, /* Start index */
934 Py_ssize_t end, /* Stop index */
935 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +0000936 );
937
938/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +0000939 given search direction or -1 if not found. -2 is returned in case
940 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000941
Martin v. Löwis18e16552006-02-15 17:27:45 +0000942PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000943 PyObject *str, /* String */
944 PyObject *substr, /* Substring to find */
945 Py_ssize_t start, /* Start index */
946 Py_ssize_t end, /* Stop index */
947 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +0000948 );
949
/* Declared only when the limited API is not in use, or when the requested
   Py_LIMITED_API value is at least 0x03030000. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200950#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951/* Like PyUnicode_Find, but search for single character only. */
952PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
953 PyObject *str, /* String */
954 Py_UCS4 ch, /* Character to find */
955 Py_ssize_t start, /* Start index */
956 Py_ssize_t end, /* Stop index */
957 int direction /* Find direction: +1 forward, -1 backward */
958 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200959#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200960
Barry Warsaw51ac5802000-03-20 16:36:48 +0000961/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000962
Martin v. Löwis18e16552006-02-15 17:27:45 +0000963PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 PyObject *str, /* String */
965 PyObject *substr, /* Substring to count */
966 Py_ssize_t start, /* Start index */
967 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +0000968 );
969
Barry Warsaw51ac5802000-03-20 16:36:48 +0000970/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000971 and return the resulting Unicode object. */
972
Mark Hammond91a681d2002-08-12 07:21:58 +0000973PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000974 PyObject *str, /* String */
975 PyObject *substr, /* Substring to find */
976 PyObject *replstr, /* Substring to replace */
977 Py_ssize_t maxcount /* Max. number of replacements to apply;
978 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +0000979 );
980
981/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +0200982 greater than, respectively.
983 Raise an exception and return -1 on error (check PyErr_Occurred()). */
Guido van Rossumd8225182000-03-10 22:33:05 +0000984
Mark Hammond91a681d2002-08-12 07:21:58 +0000985PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000986 PyObject *left, /* Left string */
987 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000988 );
989
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200990/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
991 equal, and greater than, respectively. It is best to pass only
992 ASCII-encoded strings, but the function interprets the input string as
993 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +0200994 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200995
Martin v. Löwis5b222132007-06-10 09:51:05 +0000996PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
997 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000998 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +0000999 );
1000
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001001/* Rich compare two strings and return one of the following:
1002
1003 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +00001004 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001005 - Py_NotImplemented in case the type combination is unknown
1006
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001007 Possible values for op:
1008
1009 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1010
1011*/
1012
1013PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 PyObject *left, /* Left string */
1015 PyObject *right, /* Right string */
1016 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001017 );
1018
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001019/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001020 the resulting Unicode string. */
1021
Mark Hammond91a681d2002-08-12 07:21:58 +00001022PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023 PyObject *format, /* Format string */
1024 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001025 );
1026
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001027/* Checks whether element is contained in container and returns 1/0
1028 accordingly.
1029
Martin Pantercc71a792016-04-05 06:19:42 +00001030 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001031 returned in case of an error. */
1032
Mark Hammond91a681d2002-08-12 07:21:58 +00001033PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001034 PyObject *container, /* Container string */
1035 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001036 );
1037
Martin v. Löwis47383402007-08-15 07:32:56 +00001038/* Checks whether argument is a valid identifier. */
1039
1040PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1041
Guido van Rossumd8225182000-03-10 22:33:05 +00001042/* === Characters Type APIs =============================================== */
1043
/* When the limited API is not requested, also pull in
   cpython/unicodeobject.h. Py_CPYTHON_UNICODEOBJECT_H is defined only for
   the duration of that include and removed again afterwards.
   NOTE(review): presumably the included header checks this macro to refuse
   being included directly -- confirm in cpython/unicodeobject.h. */
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03001044#ifndef Py_LIMITED_API
Victor Stinner75e46992018-11-26 17:29:38 +01001045# define Py_CPYTHON_UNICODEOBJECT_H
1046# include "cpython/unicodeobject.h"
1047# undef Py_CPYTHON_UNICODEOBJECT_H
1048#endif
Raymond Hettingerac2ef652015-07-04 16:04:44 -07001049
Guido van Rossumd8225182000-03-10 22:33:05 +00001050#ifdef __cplusplus
1051}
1052#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001053#endif /* !Py_UNICODEOBJECT_H */