blob: 90b3299fd26ceb419a8aaeb0c51b5ab760b1a715 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode, so Py_USING_UNICODE is defined
   unconditionally. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
/* The build configuration has to provide SIZEOF_WCHAR_T; compilation is
   stopped with an #error otherwise. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
/* Py_UNICODE_SIZE is an alias for the configured SIZEOF_WCHAR_T. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
/* "Can be used for UCS-4 storage" is decided purely by size here:
   Py_UNICODE_WIDE is defined when Py_UNICODE_SIZE is at least 4. */
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Guido van Rossumd8225182000-03-10 22:33:05 +000086/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000087 through the interface functions PyUnicode_FromWideChar(),
88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000089
/* HAVE_USABLE_WCHAR_T implies HAVE_WCHAR_H: define the latter if the
   configuration only set the former. */
90#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +000091# ifndef HAVE_WCHAR_H
92# define HAVE_WCHAR_H
93# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
/* <wchar.h> is only included when HAVE_WCHAR_H is defined. */
96#ifdef HAVE_WCHAR_H
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000097# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000098#endif
99
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200100/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200101 unicode representations: unsigned 32-bit, 16-bit and 8-bit integers
   (uint32_t, uint16_t and uint8_t). */
Benjamin Petersona13e3672016-09-08 11:38:28 -0700102typedef uint32_t Py_UCS4;
103typedef uint16_t Py_UCS2;
104typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
Barry Warsaw51ac5802000-03-20 16:36:48 +0000106#ifdef __cplusplus
107extern "C" {
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110
/* Type objects exported as data by the Unicode implementation.
   NOTE(review): PyUnicodeIter_Type is presumably the type of the iterator
   over Unicode objects -- confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000111PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000112PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000113
/* PyUnicode_Check(op) tests the type of op for the
   Py_TPFLAGS_UNICODE_SUBCLASS flag via PyType_FastSubclass().
   NOTE(review): going by the flag name this accepts Unicode objects and
   instances of Unicode subtypes -- confirm where the flag is set. */
Thomas Wouters27d517b2007-02-25 20:39:11 +0000114#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
/* PyUnicode_CheckExact(op) compares the type of op against PyUnicode_Type
   itself, using Py_IS_TYPE(). */
Dong-hee Nad905df72020-02-14 02:37:17 +0900116#define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000117
Guido van Rossumd8225182000-03-10 22:33:05 +0000118/* --- Constants ---------------------------------------------------------- */
119
120/* This Unicode character will be used as replacement character during
121 decoding if the errors argument is set to "replace". Note: the
122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123 Unicode 3.0. The macro expands to that code point cast to Py_UCS4. */
124
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200125#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000126
127/* === Public API ========================================================= */
128
Georg Brandl952867a2010-06-27 10:17:12 +0000129/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes.
   The size of the buffer is passed explicitly in size. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000130PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000131 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000132 Py_ssize_t size /* size of buffer u */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000133 );
134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000135/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136 UTF-8 encoded bytes. The size is determined with strlen(), so the input
   ends at the first null byte; use PyUnicode_FromStringAndSize() to pass
   an explicit size instead. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000137PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000138 const char *u /* null-terminated UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000139 );
140
/* PyUnicode_Substring(str, start, end) is only declared when
   Py_LIMITED_API is undefined or is at least 0x03030000.
   NOTE(review): presumably returns the part of str between the indices
   start and end as a new object, NULL on error -- the header does not say;
   confirm against the implementation. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200141#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200142PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143 PyObject *str,
144 Py_ssize_t start,
145 Py_ssize_t end);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200146#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
/* Both declarations below are only visible when Py_LIMITED_API is
   undefined or is at least 0x03030000. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200148#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Georg Brandldb6c7f52011-10-07 11:19:11 +0200149/* Copy the string into a UCS4 buffer including the null character if copy_null
Serhiy Storchakacc164232016-10-02 21:29:26 +0300150 is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200151 the buffer is smaller than the string. Return buffer on success.
152
153 buflen is the length of the buffer in (Py_UCS4) characters. */
154PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155 PyObject *unicode,
156 Py_UCS4* buffer,
157 Py_ssize_t buflen,
158 int copy_null);
159
160/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161 PyMem_Malloc; if this fails, NULL is returned with a memory error
162 exception set.
   NOTE(review): the caller presumably owns the returned buffer and has to
   release it with PyMem_Free() -- confirm. */
163PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200164#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200165
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200166#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Guido van Rossumd8225182000-03-10 22:33:05 +0000167/* Get the length of the Unicode object.
   Only declared when Py_LIMITED_API is undefined or is at least
   0x03030000. */
168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170 PyObject *unicode /* Unicode object */
171);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200172#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200173
Victor Stinner157f83f2011-09-28 21:41:31 +0200174/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200175 string representation.
   The declaration is marked Py_DEPRECATED(3.3). */
176
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000178 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600179 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000180
/* Both declarations below are only visible when Py_LIMITED_API is
   undefined or is at least 0x03030000. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200181#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200182/* Read a character from the string. The character is returned as a
   Py_UCS4 value.
   NOTE(review): how errors (e.g. an out-of-range index) are reported is
   not described here -- confirm against the implementation. */
183
184PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185 PyObject *unicode, /* Unicode object */
186 Py_ssize_t index /* position of the character to read */
187 );
188
189/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200190 PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200193
194PyAPI_FUNC(int) PyUnicode_WriteChar(
195 PyObject *unicode, /* Unicode object to modify */
196 Py_ssize_t index, /* position of the character to write */
197 Py_UCS4 character /* character to store */
198 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200199#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200200
Martin Panter6245cb32016-04-15 02:14:19 +0000201/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000204
205 *unicode is modified to point to the new (resized) object and 0
206 returned on success.
207
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100208 Try to resize the string in place (which is usually faster than allocating
209 a new string and copy characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000210
211 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100212 is returned and *unicode left untouched.
213
214 WARNING: The function doesn't check string content, the result may not be a
215 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000216
Mark Hammond91a681d2002-08-12 07:21:58 +0000217PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000218 PyObject **unicode, /* Pointer to the Unicode object; *unicode is
                          updated on success, left untouched on error */
219 Py_ssize_t length /* New length (see the comment above for the unit) */
Guido van Rossum52c23592000-04-10 13:41:41 +0000220 );
221
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300222/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000223
Martin Panter20d32552016-04-15 00:56:21 +0000224 bytes, bytearray and other bytes-like objects are decoded according to the
225 given encoding and error handler. The encoding and error handler can be
226 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000227
Martin Panter20d32552016-04-15 00:56:21 +0000228 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000229
230 The API returns NULL in case of an error. The caller is responsible
231 for decref'ing the returned objects.
232
233*/
234
Mark Hammond91a681d2002-08-12 07:21:58 +0000235PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200236 PyObject *obj, /* bytes, bytearray or other bytes-like object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000237 const char *encoding, /* encoding; NULL selects UTF-8 */
238 const char *errors /* error handling; NULL selects "strict" */
239 );
240
Martin Panter20d32552016-04-15 00:56:21 +0000241/* Copy an instance of a Unicode subtype to a new true Unicode object if
242 necessary. If obj is already a true Unicode object (not a subtype), return
243 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000244
245 The API returns NULL in case of an error. The caller is responsible
246 for decref'ing the returned objects.
247
248*/
249
Mark Hammond91a681d2002-08-12 07:21:58 +0000250PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200251 PyObject *obj /* Unicode object or instance of a Unicode subtype */
Guido van Rossumd8225182000-03-10 22:33:05 +0000252 );
253
/* Build an object from an ASCII-encoded format string and its arguments.
   PyUnicode_FromFormatV() receives the arguments as a va_list,
   PyUnicode_FromFormat() as variadic arguments.
   NOTE(review): the supported format codes and the error behaviour are not
   described in this header -- see the C API documentation. */
Victor Stinner1205f272010-09-11 00:54:47 +0000254PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255 const char *format, /* ASCII-encoded string */
256 va_list vargs
257 );
258PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259 const char *format, /* ASCII-encoded string */
260 ...
261 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000262
/* String interning.
   NOTE(review): PyUnicode_InternInPlace() takes a PyObject** and returns
   nothing, so it presumably updates the caller's reference in place --
   confirm against the implementation. */
Walter Dörwald16807132007-05-25 13:52:07 +0000263PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
/* Takes a UTF-8 encoded C string and returns an object.
   NOTE(review): presumably the interned Unicode object for u -- confirm. */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000264PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
265 const char *u /* UTF-8 encoded string */
266 );
Walter Dörwald16807132007-05-25 13:52:07 +0000267
Victor Stinner583ee5a2020-10-02 14:49:00 +0200268// PyUnicode_InternImmortal() is deprecated since Python 3.10
269// and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
// It takes the same PyObject** argument as PyUnicode_InternInPlace().
270Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
271
Walter Dörwald16807132007-05-25 13:52:07 +0000272/* Evaluates to the state.interned field of op. The macro casts op to
   PyASCIIObject* without checking its type: use only if you know it's a
   string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273#define PyUnicode_CHECK_INTERNED(op) \
274 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000275
Guido van Rossumd8225182000-03-10 22:33:05 +0000276/* --- wchar_t support for platforms which support it --------------------- */
277
278#ifdef HAVE_WCHAR_H
279
Georg Brandl952867a2010-06-27 10:17:12 +0000280/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000281 size.
282
283 The buffer is copied into the new object. */
284
Mark Hammond91a681d2002-08-12 07:21:58 +0000285PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200286 const wchar_t *w, /* wchar_t buffer; copied into the new object */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000287 Py_ssize_t size /* size of buffer w */
Guido van Rossumd8225182000-03-10 22:33:05 +0000288 );
289
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000290/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000291 most size wchar_t characters are copied.
292
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000293 Note that the resulting wchar_t string may or may not be
294 0-terminated. It is the responsibility of the caller to make sure
295 that the wchar_t string is 0-terminated in case this is required by
296 the application.
297
298 Returns the number of wchar_t characters copied (excluding a
299 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000300 error. */
301
Martin v. Löwis18e16552006-02-15 17:27:45 +0000302PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000303 PyObject *unicode, /* Unicode object */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200304 wchar_t *w, /* destination wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305 Py_ssize_t size /* size of buffer: at most this many wchar_t
                       characters are copied */
Guido van Rossumd8225182000-03-10 22:33:05 +0000306 );
307
Victor Stinner137c34c2010-09-29 10:25:54 +0000308/* Convert the Unicode object to a wide character string. The output string
309 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200310 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000311
Victor Stinner22fabe22015-02-11 18:17:56 +0100312 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
Victor Stinner137c34c2010-09-29 10:25:54 +0000313 on success. On error, returns NULL and raises a MemoryError; *size is
314 undefined in that case. */
315
316PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000317 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000318 Py_ssize_t *size /* out: number of characters of the result;
                        may be NULL */
319 );
320
Guido van Rossumd8225182000-03-10 22:33:05 +0000321#endif
322
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000323/* --- Unicode ordinals --------------------------------------------------- */
324
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325/* Create a Unicode Object from the given Unicode code point ordinal.
326
Ezio Melottie7f90372012-10-05 03:33:31 +0300327 The ordinal must be in range(0x110000), i.e. 0 <= ordinal < 0x110000.
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000328 A ValueError is raised in case it is not.
329
330*/
331
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000332PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000333
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000335
336 Many of these APIs take two arguments encoding and errors. These
337 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000338 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000339
Georg Brandl952867a2010-06-27 10:17:12 +0000340 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000341
342 Error handling is set by errors which may also be set to NULL
343 meaning to use the default handling defined for the codec. Default
344 error handling for all builtin codecs is "strict" (ValueErrors are
345 raised).
346
347 The codecs all use a similar interface. Only deviations from the
348 generic ones are documented.
349
350*/
351
Fred Drakecb093fe2000-05-09 19:51:53 +0000352/* --- Manage the default encoding ---------------------------------------- */
353
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000354/* Returns the name of the default encoding as a C string: "utf-8". */
Mark Hammond91a681d2002-08-12 07:21:58 +0000355PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000356
Guido van Rossumd8225182000-03-10 22:33:05 +0000357/* --- Generic Codecs ----------------------------------------------------- */
358
359/* Create a Unicode object by decoding the encoded string s of the
360 given size.
   encoding and errors follow the conventions of the "Builtin Codecs"
   overview above: a NULL encoding selects the default encoding (UTF-8), a
   NULL errors selects the default error handling of the codec. */
361
Mark Hammond91a681d2002-08-12 07:21:58 +0000362PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000363 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000365 const char *encoding, /* encoding */
366 const char *errors /* error handling */
367 );
368
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000369/* Decode a Unicode object unicode and return the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300370 object.
371
372 This API is DEPRECATED (the declaration is marked Py_DEPRECATED(3.6)).
373 The only supported standard encoding is rot13.
374 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
375 that decode from str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000375
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600376Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 PyObject *unicode, /* Unicode object */
378 const char *encoding, /* encoding */
379 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600380 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000381
382/* Decode a Unicode object unicode and return the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300383 object.
384
385 This API is DEPRECATED (the declaration is marked Py_DEPRECATED(3.6)).
386 The only supported standard encoding is rot13.
387 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
388 that decode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000388
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600389Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 PyObject *unicode, /* Unicode object */
391 const char *encoding, /* encoding */
392 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600393 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000394
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000395/* Encodes a Unicode object and returns the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300396 object.
397
Ville Skyttä49b27342017-08-03 09:00:59 +0300398 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
Serhiy Storchaka00939072016-10-27 21:05:49 +0300399 since all standard encodings (except rot13) encode str to bytes.
400 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
401 that encode from str to non-bytes. */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000402
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600403Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 PyObject *unicode, /* Unicode object */
405 const char *encoding, /* encoding */
406 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600407 );
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000408
Guido van Rossumd8225182000-03-10 22:33:05 +0000409/* Encodes a Unicode object and returns the result as Python string
410 object (bytes: all standard encodings except rot13 encode str to bytes).
   A NULL encoding selects the default encoding (UTF-8); a NULL errors
   selects the default error handling of the codec. */
411
Mark Hammond91a681d2002-08-12 07:21:58 +0000412PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000413 PyObject *unicode, /* Unicode object */
414 const char *encoding, /* encoding */
415 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000416 );
417
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000418/* Encodes a Unicode object and returns the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300419 object.
420
421 This API is DEPRECATED. The only supported standard encoding is rot13.
422 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
423 that encode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000424
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600425Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 PyObject *unicode, /* Unicode object */
427 const char *encoding, /* encoding */
428 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600429 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000430
431/* Build an encoding map from string, which is documented below as a
   256 character map.
   NOTE(review): the type and use of the returned map object are not
   described in this header -- confirm against the implementation. */
432
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000433PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
434 PyObject* string /* 256 character map */
435 );
436
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000437/* --- UTF-7 Codecs ------------------------------------------------------- */
438
/* Decode the UTF-7 encoded buffer string of the given length.
   errors selects the error handling; as described in the "Builtin Codecs"
   overview above, NULL means the default handling of the codec. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000439PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000440 const char *string, /* UTF-7 encoded string */
441 Py_ssize_t length, /* size of string */
442 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000443 );
444
/* Same arguments as PyUnicode_DecodeUTF7() plus consumed, an output
   parameter annotated as receiving the number of bytes consumed.
   NOTE(review): the behaviour for a NULL consumed pointer and for
   incomplete trailing input is not described in this header -- see the
   C API documentation. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000445PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000446 const char *string, /* UTF-7 encoded string */
447 Py_ssize_t length, /* size of string */
448 const char *errors, /* error handling */
449 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000450 );
451
Guido van Rossumd8225182000-03-10 22:33:05 +0000452/* --- UTF-8 Codecs ------------------------------------------------------- */
453
/* Decode the UTF-8 encoded buffer string of the given length.
   errors selects the error handling; as described in the "Builtin Codecs"
   overview above, NULL means the default handling of the codec. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000454PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000455 const char *string, /* UTF-8 encoded string */
456 Py_ssize_t length, /* size of string */
457 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000458 );
459
/* Same arguments as PyUnicode_DecodeUTF8() plus consumed, an output
   parameter annotated as receiving the number of bytes consumed.
   NOTE(review): the behaviour for a NULL consumed pointer and for
   incomplete trailing input is not described in this header -- see the
   C API documentation. */
Walter Dörwald69652032004-09-07 20:24:22 +0000460PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000461 const char *string, /* UTF-8 encoded string */
462 Py_ssize_t length, /* size of string */
463 const char *errors, /* error handling */
464 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000465 );
466
/* Encode the Unicode object using UTF-8 and return the result as a Python
   object.
   NOTE(review): presumably a bytes object, NULL on error -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000467PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000469 );
470
Walter Dörwald41980ca2007-08-16 21:55:45 +0000471/* --- UTF-32 Codecs ------------------------------------------------------ */
472
473/* Decodes length bytes from a UTF-32 encoded buffer string and returns
474 the corresponding Unicode object.
475
476 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000477 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000478
479 If byteorder is non-NULL, the decoder starts decoding using the
480 given byte order:
481
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 *byteorder == -1: little endian
483 *byteorder == 0: native order
484 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000485
486 In native mode, the first four bytes of the stream are checked for a
487 BOM mark. If found, the BOM mark is analysed, the byte order
488 adjusted and the BOM skipped. In the other modes, no BOM mark
489 interpretation is done. After completion, *byteorder is set to the
490 current byte order at the end of input data.
491
492 If byteorder is NULL, the codec starts in native order mode.
493
494*/
495
496PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 const char *string, /* UTF-32 encoded string */
498 Py_ssize_t length, /* size of string, in bytes */
499 const char *errors, /* error handling; NULL means "strict" */
500 int *byteorder /* pointer to byteorder to use
501 0=native;-1=LE,1=BE; updated on
502 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000503 );
504
/* Same arguments as PyUnicode_DecodeUTF32() plus consumed, an output
   parameter annotated as receiving the number of bytes consumed.
   NOTE(review): the behaviour for a NULL consumed pointer and for
   incomplete trailing input is not described in this header -- see the
   C API documentation. */
505PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 const char *string, /* UTF-32 encoded string */
507 Py_ssize_t length, /* size of string, in bytes */
508 const char *errors, /* error handling; NULL means "strict" */
509 int *byteorder, /* pointer to byteorder to use
510 0=native;-1=LE,1=BE; updated on
511 exit */
512 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000513 );
514
515/* Encodes the Unicode object and returns a Python string using the
516 UTF-32 encoding in native byte order. The string always starts with a
   BOM mark. */
517
518PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000519 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000520 );
521
522/* Returns a Python string object holding the UTF-32 encoded value of
523 the Unicode data.
524
525 If byteorder is not 0, output is written according to the following
526 byte order:
527
528 byteorder == -1: little endian
529 byteorder == 0: native byte order (writes a BOM mark)
530 byteorder == 1: big endian
531
532 If byteorder is 0, the output string will always start with the
533 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
534 prepended.
535
536*/
537
Guido van Rossumd8225182000-03-10 22:33:05 +0000538/* --- UTF-16 Codecs ------------------------------------------------------ */
539
Guido van Rossum9e896b32000-04-05 20:11:21 +0000540/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000541 the corresponding Unicode object.
542
543 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000544 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000545
546 If byteorder is non-NULL, the decoder starts decoding using the
547 given byte order:
548
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000549 *byteorder == -1: little endian
550 *byteorder == 0: native order
551 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000552
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000553 In native mode, the first two bytes of the stream are checked for a
554 BOM mark. If found, the BOM mark is analysed, the byte order
555 adjusted and the BOM skipped. In the other modes, no BOM mark
556 interpretation is done. After completion, *byteorder is set to the
557 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000558
559 If byteorder is NULL, the codec starts in native order mode.
560
561*/
562
Mark Hammond91a681d2002-08-12 07:21:58 +0000563PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000564 const char *string, /* UTF-16 encoded string */
565 Py_ssize_t length, /* size of string, in bytes */
566 const char *errors, /* error handling; NULL means "strict" */
567 int *byteorder /* pointer to byteorder to use
568 0=native;-1=LE,1=BE; updated on
569 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000570 );
571
/* Same arguments as PyUnicode_DecodeUTF16() plus consumed, an output
   parameter annotated as receiving the number of bytes consumed.
   NOTE(review): the behaviour for a NULL consumed pointer and for
   incomplete trailing input is not described in this header -- see the
   C API documentation. */
Walter Dörwald69652032004-09-07 20:24:22 +0000572PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000573 const char *string, /* UTF-16 encoded string */
574 Py_ssize_t length, /* size of string, in bytes */
575 const char *errors, /* error handling; NULL means "strict" */
576 int *byteorder, /* pointer to byteorder to use
577 0=native;-1=LE,1=BE; updated on
578 exit */
579 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000580 );
581
Guido van Rossumd8225182000-03-10 22:33:05 +0000582/* Encodes the Unicode object and returns a Python string using the
583 UTF-16 encoding in native byte order. The string always starts with a
   BOM mark. */
584
Mark Hammond91a681d2002-08-12 07:21:58 +0000585PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000587 );
588
Guido van Rossumd8225182000-03-10 22:33:05 +0000589/* --- Unicode-Escape Codecs ---------------------------------------------- */
590
/* Decode the Unicode-Escape encoded buffer *string* of size *length*;
   *errors* selects the error handling.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000591PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000592 const char *string, /* Unicode-Escape encoded string */
593 Py_ssize_t length, /* size of string */
594 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000595 );
596
/* Encode the Unicode object *unicode* using the Unicode-Escape codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000597PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000599 );
600
Guido van Rossumd8225182000-03-10 22:33:05 +0000601/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
602
/* Decode the Raw-Unicode-Escape encoded buffer *string* of size *length*;
   *errors* selects the error handling.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000603PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 const char *string, /* Raw-Unicode-Escape encoded string */
605 Py_ssize_t length, /* size of string */
606 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000607 );
608
/* Encode the Unicode object *unicode* using the Raw-Unicode-Escape codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000609PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000611 );
612
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000613/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000614
Victor Stinner75e46992018-11-26 17:29:38 +0100615 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000616
/* Decode the Latin-1 encoded buffer *string* of size *length*; *errors*
   selects the error handling.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000617PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000618 const char *string, /* Latin-1 encoded string */
619 Py_ssize_t length, /* size of string */
620 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000621 );
622
/* Encode the Unicode object *unicode* using the Latin-1 codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure (e.g. for ordinals outside the Latin-1 range);
   confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000623PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000625 );
626
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000628
629 Only 7-bit ASCII data is accepted. All other codes generate errors.
630
631*/
632
/* Decode the ASCII encoded buffer *string* of size *length*; *errors*
   selects the error handling.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000633PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000634 const char *string, /* ASCII encoded string */
635 Py_ssize_t length, /* size of string */
636 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000637 );
638
/* Encode the Unicode object *unicode* using the ASCII codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000639PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000641 );
642
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000644
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +0000646
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200647 Decoding mappings must map byte ordinals (integers in the range from 0 to
648 255) to Unicode strings, integers (which are then interpreted as Unicode
649 ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
650 as well as bytes mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
651 mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000652
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200653 Encoding mappings must map Unicode ordinal integers to bytes objects,
654 integers in the range from 0 to 255 or None. Unmapped character
655 ordinals (ones which cause a LookupError) as well as those mapped to
656 None are treated as "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000657
658*/
659
/* Decode the encoded buffer *string* of size *length* using the decoding
   mapping *mapping*; *errors* selects the error handling.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000660PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 const char *string, /* Encoded string */
662 Py_ssize_t length, /* size of string */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200663 PyObject *mapping, /* decoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000665 );
666
/* Encode the Unicode object *unicode* using the encoding mapping *mapping*.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000667PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000668 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200669 PyObject *mapping /* encoding mapping */
Guido van Rossumd8225182000-03-10 22:33:05 +0000670 );
671
Guido van Rossumefec1152000-03-28 02:01:15 +0000672/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000673
Victor Stinner75e46992018-11-26 17:29:38 +0100674#ifdef MS_WINDOWS
/* Decode the MBCS encoded buffer *string* of size *length*; *errors*
   selects the error handling.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000675PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +0000676 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -0700677 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +0000678 const char *errors /* error handling */
679 );
680
/* Variant of PyUnicode_DecodeMBCS() with an additional *consumed*
   out-parameter through which the number of bytes consumed is reported.
   NOTE(review): presumably a trailing incomplete multibyte sequence is left
   unconsumed when *consumed* is non-NULL; confirm. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000681PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
682 const char *string, /* MBCS encoded string */
683 Py_ssize_t length, /* size of string */
684 const char *errors, /* error handling */
685 Py_ssize_t *consumed /* bytes consumed */
686 );
687
/* Decode the buffer *string* of size *length* using the code page numbered
   *code_page*; *errors* selects the error handling and the number of bytes
   consumed is reported through *consumed*.
   Only declared when Py_LIMITED_API is undefined or is at least 0x03030000.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure; confirm. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200688#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200689PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
690 int code_page, /* code page number */
691 const char *string, /* encoded string */
692 Py_ssize_t length, /* size of string */
693 const char *errors, /* error handling */
694 Py_ssize_t *consumed /* bytes consumed */
695 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200696#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200697
/* Encode the Unicode object *unicode* using the MBCS codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure; confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000698PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +0000699 PyObject *unicode /* Unicode object */
700 );
701
/* Encode the Unicode object *unicode* using the code page numbered
   *code_page*; *errors* selects the error handling.
   Only declared when Py_LIMITED_API is undefined or is at least 0x03030000.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure; confirm. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200702#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200703PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
704 int code_page, /* code page number */
705 PyObject *unicode, /* Unicode object */
706 const char *errors /* error handling */
707 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200708#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200709
Steve Dowercc16be82016-09-08 10:35:16 -0700710#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000711
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100712/* --- Locale encoding --------------------------------------------------- */
713
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200714#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100715/* Decode a string from the current locale encoding. The decoder is strict if
716 *errors* is "strict" (or NULL); if *errors* is "surrogateescape" it uses the
717 'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
718 byte sequence can be decoded as a surrogate character and the
719 'surrogateescape' error handler is in use, the byte sequence is escaped
720 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100721 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100722
723PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
724 const char *str,
725 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +0100726 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100727
728/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
729 length using strlen(). */
730
731PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
732 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +0100733 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100734
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100735/* Encode a Unicode object to the current locale encoding. The encoder is
736 strict if *errors* is "strict" (or NULL); if *errors* is "surrogateescape",
737 the "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +0100738 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100739
740PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
741 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +0100742 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100743 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200744#endif
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100745
Martin v. Löwis011e8422009-05-05 04:43:17 +0000746/* --- File system encoding ---------------------------------------------- */
747
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000748/* ParseTuple converter: encode str objects to bytes using
749 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +0000750
751PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
752
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000753/* ParseTuple converter: decode bytes objects to unicode using
754 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
755
756PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
757
Victor Stinner77c38622010-05-14 15:58:55 +0000758/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
759 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000760
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000761 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
762 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000763
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000764 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000765*/
766
767PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
768 const char *s /* encoded string */
769 );
770
Victor Stinner77c38622010-05-14 15:58:55 +0000771/* Decode a string using Py_FileSystemDefaultEncoding
772 and the "surrogateescape" error handler.
773
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000774 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
775 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +0000776*/
777
Martin v. Löwis011e8422009-05-05 04:43:17 +0000778PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
779 const char *s, /* encoded string */
780 Py_ssize_t size /* size */
781 );
782
Victor Stinnerae6265f2010-05-15 16:27:27 +0000783/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000784 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000785
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000786 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
787 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000788*/
789
790PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
791 PyObject *unicode
792 );
793
Guido van Rossumd8225182000-03-10 22:33:05 +0000794/* --- Methods & Slots ----------------------------------------------------
795
796 These are capable of handling Unicode objects and strings on input
797 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200798 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000799
800/* Concat two strings giving a new Unicode string. */
801
Mark Hammond91a681d2002-08-12 07:21:58 +0000802PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 PyObject *left, /* Left string */
804 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000805 );
806
Walter Dörwald1ab83302007-05-18 17:15:44 +0000807/* Concat two strings and put the result in *pleft
808 (sets *pleft to NULL on error) */
809
810PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 PyObject **pleft, /* Pointer to left string */
812 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000813 );
814
815/* Concat two strings, put the result in *pleft and drop the right object
816 (sets *pleft to NULL on error) */
817
818PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 PyObject **pleft, /* Pointer to left string */
820 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000821 );
822
Guido van Rossumd8225182000-03-10 22:33:05 +0000823/* Split a string giving a list of Unicode strings.
824
825 If sep is NULL, splitting will be done at all whitespace
826 substrings. Otherwise, splits occur at the given separator.
827
828 At most maxsplit splits will be done. If negative, no limit is set.
829
830 Separators are not included in the resulting list.
831
832*/
833
Mark Hammond91a681d2002-08-12 07:21:58 +0000834PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 PyObject *s, /* String to split */
836 PyObject *sep, /* String separator */
837 Py_ssize_t maxsplit /* Maxsplit count */
838 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000839
840/* Like PyUnicode_Split(), but split at line breaks.
841
842 CRLF is considered to be one line break. Line breaks are not
843 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844
Mark Hammond91a681d2002-08-12 07:21:58 +0000845PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 PyObject *s, /* String to split */
847 int keepends /* If true, line end markers are included */
848 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000849
Thomas Wouters477c8d52006-05-27 19:21:47 +0000850/* Partition a string using a given separator. */
851
852PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 PyObject *s, /* String to partition */
854 PyObject *sep /* String separator */
855 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000856
857/* Partition a string using a given separator, searching from the end of the
858 string. */
859
860PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 PyObject *s, /* String to partition */
862 PyObject *sep /* String separator */
863 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000865/* Split a string giving a list of Unicode strings.
866
867 If sep is NULL, splitting will be done at all whitespace
868 substrings. Otherwise, splits occur at the given separator.
869
870 At most maxsplit splits will be done; if maxsplit is negative, no
871 limit is set. Unlike PyUnicode_Split(), PyUnicode_RSplit() splits
872 from the end of the string.
873
874 Separators are not included in the resulting list.
875
876*/
877
878PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 PyObject *s, /* String to split */
880 PyObject *sep, /* String separator */
881 Py_ssize_t maxsplit /* Maxsplit count */
882 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000883
Guido van Rossumd8225182000-03-10 22:33:05 +0000884/* Translate a string by applying a character mapping table to it and
885 return the resulting Unicode object.
886
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200887 The mapping table must map Unicode ordinal integers to Unicode strings,
888 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +0000889
890 Mapping tables may be dictionaries or sequences. Unmapped character
891 ordinals (ones which cause a LookupError) are left untouched and
892 are copied as-is.
893
894*/
895
Mark Hammond91a681d2002-08-12 07:21:58 +0000896PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 PyObject *str, /* String */
898 PyObject *table, /* Translate table */
899 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000900 );
901
902/* Join a sequence of strings using the given separator and return
903 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904
Mark Hammond91a681d2002-08-12 07:21:58 +0000905PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 PyObject *separator, /* Separator string */
907 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000908 );
909
910/* Return 1 if substr matches str[start:end] at the given tail end, 0
911 otherwise. Return -1 if an error occurred. */
912
Martin v. Löwis18e16552006-02-15 17:27:45 +0000913PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 PyObject *str, /* String */
915 PyObject *substr, /* Prefix or Suffix string */
916 Py_ssize_t start, /* Start index */
917 Py_ssize_t end, /* Stop index */
918 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +0000919 );
920
921/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +0000922 given search direction or -1 if not found. -2 is returned in case
923 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000924
Martin v. Löwis18e16552006-02-15 17:27:45 +0000925PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000926 PyObject *str, /* String */
927 PyObject *substr, /* Substring to find */
928 Py_ssize_t start, /* Start index */
929 Py_ssize_t end, /* Stop index */
930 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +0000931 );
932
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200933#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934/* Like PyUnicode_Find, but search for single character only. */
935PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
936 PyObject *str,
937 Py_UCS4 ch,
938 Py_ssize_t start,
939 Py_ssize_t end,
940 int direction
941 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200942#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200943
Barry Warsaw51ac5802000-03-20 16:36:48 +0000944/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000945
Martin v. Löwis18e16552006-02-15 17:27:45 +0000946PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000947 PyObject *str, /* String */
948 PyObject *substr, /* Substring to count */
949 Py_ssize_t start, /* Start index */
950 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +0000951 );
952
Barry Warsaw51ac5802000-03-20 16:36:48 +0000953/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000954 and return the resulting Unicode object. */
955
Mark Hammond91a681d2002-08-12 07:21:58 +0000956PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000957 PyObject *str, /* String */
958 PyObject *substr, /* Substring to find */
959 PyObject *replstr, /* Substring to replace */
960 Py_ssize_t maxcount /* Max. number of replacements to apply;
961 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +0000962 );
963
964/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +0200965 and greater than, respectively.
966 Raise an exception and return -1 on error. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000967
Mark Hammond91a681d2002-08-12 07:21:58 +0000968PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000969 PyObject *left, /* Left string */
970 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000971 );
972
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200973/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
974 equal, and greater than, respectively. It is best to pass only
975 ASCII-encoded strings, but the function interprets the input string as
976 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +0200977 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200978
Martin v. Löwis5b222132007-06-10 09:51:05 +0000979PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
980 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000981 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +0000982 );
983
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000984/* Rich compare two strings and return one of the following:
985
986 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +0000987 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000988 - Py_NotImplemented in case the type combination is unknown
989
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000990 Possible values for op:
991
992 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
993
994*/
995
996PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000997 PyObject *left, /* Left string */
998 PyObject *right, /* Right string */
999 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001000 );
1001
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001002/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001003 the resulting Unicode string. */
1004
Mark Hammond91a681d2002-08-12 07:21:58 +00001005PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001006 PyObject *format, /* Format string */
1007 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001008 );
1009
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001010/* Check whether element is contained in container and return 1/0
1011 accordingly.
1012
Martin Pantercc71a792016-04-05 06:19:42 +00001013 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001014 returned in case of an error. */
1015
Mark Hammond91a681d2002-08-12 07:21:58 +00001016PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017 PyObject *container, /* Container string */
1018 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001019 );
1020
Martin v. Löwis47383402007-08-15 07:32:56 +00001021/* Checks whether argument is a valid identifier. */
1022
1023PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1024
Guido van Rossumd8225182000-03-10 22:33:05 +00001025/* === Characters Type APIs =============================================== */
1026
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03001027#ifndef Py_LIMITED_API
Victor Stinner75e46992018-11-26 17:29:38 +01001028# define Py_CPYTHON_UNICODEOBJECT_H
1029# include "cpython/unicodeobject.h"
1030# undef Py_CPYTHON_UNICODEOBJECT_H
1031#endif
Raymond Hettingerac2ef652015-07-04 16:04:44 -07001032
Guido van Rossumd8225182000-03-10 22:33:05 +00001033#ifdef __cplusplus
1034}
1035#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001036#endif /* !Py_UNICODEOBJECT_H */