blob: 500ce242e9f0e850f483695463701aed7d3cce14 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Guido van Rossumd8225182000-03-10 22:33:05 +000086/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000087 through the interface functions PyUnicode_FromWideChar(),
88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000089
90#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +000091# ifndef HAVE_WCHAR_H
92# define HAVE_WCHAR_H
93# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96#ifdef HAVE_WCHAR_H
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000097# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000098#endif
99
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200100/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200101 unicode representations. */
Benjamin Petersona13e3672016-09-08 11:38:28 -0700102typedef uint32_t Py_UCS4;
103typedef uint16_t Py_UCS2;
104typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
Barry Warsaw51ac5802000-03-20 16:36:48 +0000106#ifdef __cplusplus
107extern "C" {
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110
Mark Hammond91a681d2002-08-12 07:21:58 +0000111PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000112PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000113
Thomas Wouters27d517b2007-02-25 20:39:11 +0000114#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
Dong-hee Nad905df72020-02-14 02:37:17 +0900116#define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000117
Guido van Rossumd8225182000-03-10 22:33:05 +0000118/* --- Constants ---------------------------------------------------------- */
119
120/* This Unicode character will be used as replacement character during
121 decoding if the errors argument is set to "replace". Note: the
122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123 Unicode 3.0. */
124
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200125#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000126
127/* === Public API ========================================================= */
128
Georg Brandl952867a2010-06-27 10:17:12 +0000129/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000130PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000131 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000132 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000133 );
134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000135/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000137PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000138 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000139 );
140
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200141#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200142PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143 PyObject *str,
144 Py_ssize_t start,
145 Py_ssize_t end);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200146#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200148#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Georg Brandldb6c7f52011-10-07 11:19:11 +0200149/* Copy the string into a UCS4 buffer including the null character if copy_null
Serhiy Storchakacc164232016-10-02 21:29:26 +0300150 is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200151 the buffer is smaller than the string. Return buffer on success.
152
153 buflen is the length of the buffer in (Py_UCS4) characters. */
154PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155 PyObject *unicode,
156 Py_UCS4* buffer,
157 Py_ssize_t buflen,
158 int copy_null);
159
160/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161 PyMem_Malloc; if this fails, NULL is returned with a memory error
162 exception set. */
163PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200164#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200165
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200166#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Guido van Rossumd8225182000-03-10 22:33:05 +0000167/* Get the length of the Unicode object. */
168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170 PyObject *unicode
171);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200172#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200173
Victor Stinner157f83f2011-09-28 21:41:31 +0200174/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200175 string representation. */
176
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000178 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600179 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000180
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200181#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200182/* Read a character from the string. */
183
184PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185 PyObject *unicode,
186 Py_ssize_t index
187 );
188
189/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200190 PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200193
194PyAPI_FUNC(int) PyUnicode_WriteChar(
195 PyObject *unicode,
196 Py_ssize_t index,
197 Py_UCS4 character
198 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200199#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200200
Martin Panter6245cb32016-04-15 02:14:19 +0000201/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000204
205 *unicode is modified to point to the new (resized) object and 0
206 returned on success.
207
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100208 Try to resize the string in place (which is usually faster than allocating
209 a new string and copying characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000210
211 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100212 is returned and *unicode left untouched.
213
214 WARNING: The function doesn't check string content, the result may not be a
215 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000216
Mark Hammond91a681d2002-08-12 07:21:58 +0000217PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000218 PyObject **unicode, /* Pointer to the Unicode object */
219 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000220 );
221
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300222/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000223
Martin Panter20d32552016-04-15 00:56:21 +0000224 bytes, bytearray and other bytes-like objects are decoded according to the
225 given encoding and error handler. The encoding and error handler can be
226 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000227
Martin Panter20d32552016-04-15 00:56:21 +0000228 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000229
230 The API returns NULL in case of an error. The caller is responsible
231 for decref'ing the returned objects.
232
233*/
234
Mark Hammond91a681d2002-08-12 07:21:58 +0000235PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200236 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000237 const char *encoding, /* encoding */
238 const char *errors /* error handling */
239 );
240
Martin Panter20d32552016-04-15 00:56:21 +0000241/* Copy an instance of a Unicode subtype to a new true Unicode object if
242 necessary. If obj is already a true Unicode object (not a subtype), return
243 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000244
245 The API returns NULL in case of an error. The caller is responsible
246 for decref'ing the returned objects.
247
248*/
249
Mark Hammond91a681d2002-08-12 07:21:58 +0000250PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200251 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000252 );
253
Victor Stinner1205f272010-09-11 00:54:47 +0000254PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255 const char *format, /* ASCII-encoded string */
256 va_list vargs
257 );
258PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259 const char *format, /* ASCII-encoded string */
260 ...
261 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000262
Walter Dörwald16807132007-05-25 13:52:07 +0000263PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000265PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
266 const char *u /* UTF-8 encoded string */
267 );
Walter Dörwald16807132007-05-25 13:52:07 +0000268
269/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200270#define PyUnicode_CHECK_INTERNED(op) \
271 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000272
Guido van Rossumd8225182000-03-10 22:33:05 +0000273/* --- wchar_t support for platforms which support it --------------------- */
274
275#ifdef HAVE_WCHAR_H
276
Georg Brandl952867a2010-06-27 10:17:12 +0000277/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000278 size.
279
280 The buffer is copied into the new object. */
281
Mark Hammond91a681d2002-08-12 07:21:58 +0000282PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200283 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000284 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000285 );
286
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000287/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000288 most size wchar_t characters are copied.
289
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000290 Note that the resulting wchar_t string may or may not be
291 0-terminated. It is the responsibility of the caller to make sure
292 that the wchar_t string is 0-terminated in case this is required by
293 the application.
294
295 Returns the number of wchar_t characters copied (excluding a
296 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000297 error. */
298
Martin v. Löwis18e16552006-02-15 17:27:45 +0000299PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000300 PyObject *unicode, /* Unicode object */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200301 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000302 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000303 );
304
Victor Stinner137c34c2010-09-29 10:25:54 +0000305/* Convert the Unicode object to a wide character string. The output string
306 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200307 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000308
Victor Stinner22fabe22015-02-11 18:17:56 +0100309 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
Victor Stinner137c34c2010-09-29 10:25:54 +0000310 on success. On error, raises a MemoryError and returns NULL; *size is
311 undefined. */
312
313PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000314 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000315 Py_ssize_t *size /* number of characters of the result */
316 );
317
Guido van Rossumd8225182000-03-10 22:33:05 +0000318#endif
319
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000320/* --- Unicode ordinals --------------------------------------------------- */
321
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322/* Create a Unicode Object from the given Unicode code point ordinal.
323
Ezio Melottie7f90372012-10-05 03:33:31 +0300324 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000325 raised in case it is not.
326
327*/
328
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000329PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000330
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000332
333 Many of these APIs take two arguments encoding and errors. These
334 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000335 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000336
Georg Brandl952867a2010-06-27 10:17:12 +0000337 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000338
339 Error handling is set by errors which may also be set to NULL
340 meaning to use the default handling defined for the codec. Default
341 error handling for all builtin codecs is "strict" (ValueErrors are
342 raised).
343
344 The codecs all use a similar interface. Only deviations from the
345 generic ones are documented.
346
347*/
348
Fred Drakecb093fe2000-05-09 19:51:53 +0000349/* --- Manage the default encoding ---------------------------------------- */
350
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000351/* Returns "utf-8". */
Mark Hammond91a681d2002-08-12 07:21:58 +0000352PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000353
Guido van Rossumd8225182000-03-10 22:33:05 +0000354/* --- Generic Codecs ----------------------------------------------------- */
355
356/* Create a Unicode object by decoding the encoded string s of the
357 given size. */
358
Mark Hammond91a681d2002-08-12 07:21:58 +0000359PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000360 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000361 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000362 const char *encoding, /* encoding */
363 const char *errors /* error handling */
364 );
365
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000366/* Decode a Unicode object unicode and return the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300367 object.
368
369 This API is DEPRECATED. The only supported standard encoding is rot13.
370 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
371 that decode from str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000372
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600373Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000374 PyObject *unicode, /* Unicode object */
375 const char *encoding, /* encoding */
376 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600377 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000378
379/* Decode a Unicode object unicode and return the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300380 object.
381
382 This API is DEPRECATED. The only supported standard encoding is rot13.
383 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
384 that decode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000385
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600386Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 PyObject *unicode, /* Unicode object */
388 const char *encoding, /* encoding */
389 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600390 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000391
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000392/* Encodes a Unicode object and returns the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300393 object.
394
Ville Skyttä49b27342017-08-03 09:00:59 +0300395 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
Serhiy Storchaka00939072016-10-27 21:05:49 +0300396 since all standard encodings (except rot13) encode str to bytes.
397 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
398 that encode from str to non-bytes. */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000399
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600400Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 PyObject *unicode, /* Unicode object */
402 const char *encoding, /* encoding */
403 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600404 );
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000405
Guido van Rossumd8225182000-03-10 22:33:05 +0000406/* Encodes a Unicode object and returns the result as Python bytes
407 object. */
408
Mark Hammond91a681d2002-08-12 07:21:58 +0000409PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000410 PyObject *unicode, /* Unicode object */
411 const char *encoding, /* encoding */
412 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000413 );
414
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000415/* Encodes a Unicode object and returns the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300416 object.
417
418 This API is DEPRECATED. The only supported standard encoding is rot13.
419 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
420 that encode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000421
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600422Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 PyObject *unicode, /* Unicode object */
424 const char *encoding, /* encoding */
425 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600426 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000427
428/* Build an encoding map. */
429
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000430PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
431 PyObject* string /* 256 character map */
432 );
433
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000434/* --- UTF-7 Codecs ------------------------------------------------------- */
435
Mark Hammond91a681d2002-08-12 07:21:58 +0000436PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000437 const char *string, /* UTF-7 encoded string */
438 Py_ssize_t length, /* size of string */
439 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000440 );
441
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000442PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000443 const char *string, /* UTF-7 encoded string */
444 Py_ssize_t length, /* size of string */
445 const char *errors, /* error handling */
446 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000447 );
448
Guido van Rossumd8225182000-03-10 22:33:05 +0000449/* --- UTF-8 Codecs ------------------------------------------------------- */
450
Mark Hammond91a681d2002-08-12 07:21:58 +0000451PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 const char *string, /* UTF-8 encoded string */
453 Py_ssize_t length, /* size of string */
454 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000455 );
456
Walter Dörwald69652032004-09-07 20:24:22 +0000457PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458 const char *string, /* UTF-8 encoded string */
459 Py_ssize_t length, /* size of string */
460 const char *errors, /* error handling */
461 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000462 );
463
Mark Hammond91a681d2002-08-12 07:21:58 +0000464PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000466 );
467
Walter Dörwald41980ca2007-08-16 21:55:45 +0000468/* --- UTF-32 Codecs ------------------------------------------------------ */
469
470/* Decodes length bytes from a UTF-32 encoded buffer string and returns
471 the corresponding Unicode object.
472
473 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000475
476 If byteorder is non-NULL, the decoder starts decoding using the
477 given byte order:
478
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 *byteorder == -1: little endian
480 *byteorder == 0: native order
481 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000482
483 In native mode, the first four bytes of the stream are checked for a
484 BOM mark. If found, the BOM mark is analysed, the byte order
485 adjusted and the BOM skipped. In the other modes, no BOM mark
486 interpretation is done. After completion, *byteorder is set to the
487 current byte order at the end of input data.
488
489 If byteorder is NULL, the codec starts in native order mode.
490
491*/
492
493PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000494 const char *string, /* UTF-32 encoded string */
495 Py_ssize_t length, /* size of string */
496 const char *errors, /* error handling */
497 int *byteorder /* pointer to byteorder to use
498 0=native;-1=LE,1=BE; updated on
499 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000500 );
501
502PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 const char *string, /* UTF-32 encoded string */
504 Py_ssize_t length, /* size of string */
505 const char *errors, /* error handling */
506 int *byteorder, /* pointer to byteorder to use
507 0=native;-1=LE,1=BE; updated on
508 exit */
509 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000510 );
511
512/* Returns a Python bytes object using the UTF-32 encoding in native byte
513 order. The string always starts with a BOM mark. */
514
515PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000516 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000517 );
518
519/* Returns a Python string object holding the UTF-32 encoded value of
520 the Unicode data.
521
522 If byteorder is not 0, output is written according to the following
523 byte order:
524
525 byteorder == -1: little endian
526 byteorder == 0: native byte order (writes a BOM mark)
527 byteorder == 1: big endian
528
529 If byteorder is 0, the output string will always start with the
530 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
531 prepended.
532
533*/
534
Guido van Rossumd8225182000-03-10 22:33:05 +0000535/* --- UTF-16 Codecs ------------------------------------------------------ */
536
Guido van Rossum9e896b32000-04-05 20:11:21 +0000537/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000538 the corresponding Unicode object.
539
540 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000541 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000542
543 If byteorder is non-NULL, the decoder starts decoding using the
544 given byte order:
545
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 *byteorder == -1: little endian
547 *byteorder == 0: native order
548 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000549
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000550 In native mode, the first two bytes of the stream are checked for a
551 BOM mark. If found, the BOM mark is analysed, the byte order
552 adjusted and the BOM skipped. In the other modes, no BOM mark
553 interpretation is done. After completion, *byteorder is set to the
554 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000555
556 If byteorder is NULL, the codec starts in native order mode.
557
558*/
559
Mark Hammond91a681d2002-08-12 07:21:58 +0000560PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 const char *string, /* UTF-16 encoded string */
562 Py_ssize_t length, /* size of string */
563 const char *errors, /* error handling */
564 int *byteorder /* pointer to byteorder to use
565 0=native;-1=LE,1=BE; updated on
566 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000567 );
568
Walter Dörwald69652032004-09-07 20:24:22 +0000569PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000570 const char *string, /* UTF-16 encoded string */
571 Py_ssize_t length, /* size of string */
572 const char *errors, /* error handling */
573 int *byteorder, /* pointer to byteorder to use
574 0=native;-1=LE,1=BE; updated on
575 exit */
576 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000577 );
578
Guido van Rossumd8225182000-03-10 22:33:05 +0000579/* Returns a Python bytes object using the UTF-16 encoding in native byte
580 order. The string always starts with a BOM mark. */
581
Mark Hammond91a681d2002-08-12 07:21:58 +0000582PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000583 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000584 );
585
Guido van Rossumd8225182000-03-10 22:33:05 +0000586/* --- Unicode-Escape Codecs ---------------------------------------------- */
587
Mark Hammond91a681d2002-08-12 07:21:58 +0000588PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000589 const char *string, /* Unicode-Escape encoded string */
590 Py_ssize_t length, /* size of string */
591 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000592 );
593
Mark Hammond91a681d2002-08-12 07:21:58 +0000594PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000595 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000596 );
597
Guido van Rossumd8225182000-03-10 22:33:05 +0000598/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
599
Mark Hammond91a681d2002-08-12 07:21:58 +0000600PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 const char *string, /* Raw-Unicode-Escape encoded string */
602 Py_ssize_t length, /* size of string */
603 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000604 );
605
Mark Hammond91a681d2002-08-12 07:21:58 +0000606PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000608 );
609
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000611
Victor Stinner75e46992018-11-26 17:29:38 +0100612 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000613
Mark Hammond91a681d2002-08-12 07:21:58 +0000614PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 const char *string, /* Latin-1 encoded string */
616 Py_ssize_t length, /* size of string */
617 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000618 );
619
Mark Hammond91a681d2002-08-12 07:21:58 +0000620PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000622 );
623
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000625
626 Only 7-bit ASCII data is accepted. All other codes generate errors.
627
628*/
629
Mark Hammond91a681d2002-08-12 07:21:58 +0000630PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 const char *string, /* ASCII encoded string */
632 Py_ssize_t length, /* size of string */
633 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000634 );
635
Mark Hammond91a681d2002-08-12 07:21:58 +0000636PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000638 );
639
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000641
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +0000643
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200644 Decoding mappings must map byte ordinals (integers in the range from 0 to
645 255) to Unicode strings, integers (which are then interpreted as Unicode
646 ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
647 as well as those mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
648 mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000649
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200650 Encoding mappings must map Unicode ordinal integers to bytes objects,
651 integers in the range from 0 to 255 or None. Unmapped character
652 ordinals (ones which cause a LookupError) as well as those mapped to
653 None are treated as "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000654
655*/
656
Mark Hammond91a681d2002-08-12 07:21:58 +0000657PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 const char *string, /* Encoded string */
659 Py_ssize_t length, /* size of string */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200660 PyObject *mapping, /* decoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000662 );
663
Mark Hammond91a681d2002-08-12 07:21:58 +0000664PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200666 PyObject *mapping /* encoding mapping */
Guido van Rossumd8225182000-03-10 22:33:05 +0000667 );
668
Guido van Rossumefec1152000-03-28 02:01:15 +0000669/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000670
Victor Stinner75e46992018-11-26 17:29:38 +0100671#ifdef MS_WINDOWS
Mark Hammond91a681d2002-08-12 07:21:58 +0000672PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +0000673 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -0700674 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +0000675 const char *errors /* error handling */
676 );
677
/* Stateful variant of PyUnicode_DecodeMBCS(): same string, length and errors
   arguments, plus the output pointer *consumed*, which receives the number of
   bytes consumed (see the parameter comment below).

   NOTE(review): presumably, when consumed is not NULL, a trailing incomplete
   multi-byte sequence is left undecoded rather than reported as an error --
   confirm against the implementation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000678PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
679 const char *string, /* MBCS encoded string */
680 Py_ssize_t length, /* size of string */
681 const char *errors, /* error handling */
682 Py_ssize_t *consumed /* bytes consumed */
683 );
684
/* Decode an encoded string using the code page identified by code_page.
   *consumed* is an output pointer for the number of bytes consumed (see the
   parameter comment below).

   The declaration is visible only when Py_LIMITED_API is undefined or is at
   least 0x03030000, and only inside the enclosing MS_WINDOWS section. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200685#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200686PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
687 int code_page, /* code page number */
688 const char *string, /* encoded string */
689 Py_ssize_t length, /* size of string */
690 const char *errors, /* error handling */
691 Py_ssize_t *consumed /* bytes consumed */
692 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200693#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200694
Mark Hammond91a681d2002-08-12 07:21:58 +0000695PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +0000696 PyObject *unicode /* Unicode object */
697 );
698
/* Encode a Unicode object using the code page identified by code_page, with
   the error handling selected by errors.

   The declaration is visible only when Py_LIMITED_API is undefined or is at
   least 0x03030000, and only inside the enclosing MS_WINDOWS section. */
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200699#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200700PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
701 int code_page, /* code page number */
702 PyObject *unicode, /* Unicode object */
703 const char *errors /* error handling */
704 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200705#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200706
Steve Dowercc16be82016-09-08 10:35:16 -0700707#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000708
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100709/* --- Locale encoding --------------------------------------------------- */
710
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200711#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100712/* Decode a string from the current locale encoding. The decoder is strict if
713 *errors* is NULL or "strict"; if *errors* is "surrogateescape", it uses the
714 'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
715 byte sequence can be decoded as a surrogate character and *errors* is
716 "surrogateescape", the byte sequence is escaped using that error handler
717 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100718 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100719
720PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
721 const char *str,
722 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +0100723 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100724
725/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
726 length using strlen(). */
727
728PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
729 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +0100730 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100731
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100732/* Encode a Unicode object to the current locale encoding. The encoder is
733 strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape", the
734 "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +0100735 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100736
737PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
738 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +0100739 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100740 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200741#endif
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100742
Martin v. Löwis011e8422009-05-05 04:43:17 +0000743/* --- File system encoding ---------------------------------------------- */
744
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000745/* ParseTuple converter: encode str objects to bytes using
746 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +0000747
748PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
749
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000750/* ParseTuple converter: decode bytes objects to unicode using
751 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
752
753PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
754
Victor Stinner77c38622010-05-14 15:58:55 +0000755/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
756 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000757
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000758 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
759 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000760
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000761 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000762*/
763
764PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
765 const char *s /* encoded string */
766 );
767
Victor Stinner77c38622010-05-14 15:58:55 +0000768/* Decode a string using Py_FileSystemDefaultEncoding
769 and the "surrogateescape" error handler.
770
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000771 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
772 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +0000773*/
774
Martin v. Löwis011e8422009-05-05 04:43:17 +0000775PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
776 const char *s, /* encoded string */
777 Py_ssize_t size /* size */
778 );
779
Victor Stinnerae6265f2010-05-15 16:27:27 +0000780/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000781 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000782
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000783 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
784 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000785*/
786
787PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
788 PyObject *unicode
789 );
790
Guido van Rossumd8225182000-03-10 22:33:05 +0000791/* --- Methods & Slots ----------------------------------------------------
792
793 These are capable of handling Unicode objects and strings on input
794 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200795 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000796
797/* Concat two strings giving a new Unicode string. */
798
Mark Hammond91a681d2002-08-12 07:21:58 +0000799PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 PyObject *left, /* Left string */
801 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000802 );
803
Walter Dörwald1ab83302007-05-18 17:15:44 +0000804/* Concat two strings and put the result in *pleft
805 (sets *pleft to NULL on error) */
806
807PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 PyObject **pleft, /* Pointer to left string */
809 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000810 );
811
812/* Concat two strings, put the result in *pleft and drop the right object
813 (sets *pleft to NULL on error) */
814
815PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 PyObject **pleft, /* Pointer to left string */
817 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000818 );
819
Guido van Rossumd8225182000-03-10 22:33:05 +0000820/* Split a string giving a list of Unicode strings.
821
822 If sep is NULL, splitting will be done at all whitespace
823 substrings. Otherwise, splits occur at the given separator.
824
825 At most maxsplit splits will be done. If negative, no limit is set.
826
827 Separators are not included in the resulting list.
828
829*/
830
Mark Hammond91a681d2002-08-12 07:21:58 +0000831PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 PyObject *s, /* String to split */
833 PyObject *sep, /* String separator */
834 Py_ssize_t maxsplit /* Maxsplit count */
835 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000836
837/* Split a string at line breaks, giving a list of Unicode strings.
838
839 CRLF is considered to be one line break. Line breaks are not
840 included in the resulting list unless keepends is nonzero. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841
Mark Hammond91a681d2002-08-12 07:21:58 +0000842PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 PyObject *s, /* String to split */
844 int keepends /* If true, line end markers are included */
845 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* Partition a string using a given separator. */
848
849PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 PyObject *s, /* String to partition */
851 PyObject *sep /* String separator */
852 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853
854/* Partition a string using a given separator, searching from the end of the
855 string. */
856
857PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000858 PyObject *s, /* String to partition */
859 PyObject *sep /* String separator */
860 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000861
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000862/* Split a string giving a list of Unicode strings.
863
864 If sep is NULL, splitting will be done at all whitespace
865 substrings. Otherwise, splits occur at the given separator.
866
867 At most maxsplit splits will be done; if maxsplit is negative, no limit
868 is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of
869 the string.
870
871 Separators are not included in the resulting list.
872
873*/
874
875PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876 PyObject *s, /* String to split */
877 PyObject *sep, /* String separator */
878 Py_ssize_t maxsplit /* Maxsplit count */
879 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000880
Guido van Rossumd8225182000-03-10 22:33:05 +0000881/* Translate a string by applying a character mapping table to it and
882 return the resulting Unicode object.
883
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200884 The mapping table must map Unicode ordinal integers to Unicode strings,
885 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +0000886
887 Mapping tables may be dictionaries or sequences. Unmapped character
888 ordinals (ones which cause a LookupError) are left untouched and
889 are copied as-is.
890
891*/
892
Mark Hammond91a681d2002-08-12 07:21:58 +0000893PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000894 PyObject *str, /* String */
895 PyObject *table, /* Translate table */
896 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000897 );
898
899/* Join a sequence of strings using the given separator and return
900 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901
Mark Hammond91a681d2002-08-12 07:21:58 +0000902PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903 PyObject *separator, /* Separator string */
904 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000905 );
906
907/* Return 1 if substr matches str[start:end] at the given tail end, 0
908 otherwise. Return -1 if an error occurred. */
909
Martin v. Löwis18e16552006-02-15 17:27:45 +0000910PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 PyObject *str, /* String */
912 PyObject *substr, /* Prefix or Suffix string */
913 Py_ssize_t start, /* Start index */
914 Py_ssize_t end, /* Stop index */
915 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +0000916 );
917
918/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +0000919 given search direction or -1 if not found. -2 is returned in case
920 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000921
Martin v. Löwis18e16552006-02-15 17:27:45 +0000922PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 PyObject *str, /* String */
924 PyObject *substr, /* Substring to find */
925 Py_ssize_t start, /* Start index */
926 Py_ssize_t end, /* Stop index */
927 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +0000928 );
929
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200930#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931/* Like PyUnicode_Find, but search for single character only. */
932PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
933 PyObject *str,
934 Py_UCS4 ch,
935 Py_ssize_t start,
936 Py_ssize_t end,
937 int direction
938 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200939#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Barry Warsaw51ac5802000-03-20 16:36:48 +0000941/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000942
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000944 PyObject *str, /* String */
945 PyObject *substr, /* Substring to count */
946 Py_ssize_t start, /* Start index */
947 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +0000948 );
949
Barry Warsaw51ac5802000-03-20 16:36:48 +0000950/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000951 and return the resulting Unicode object. */
952
Mark Hammond91a681d2002-08-12 07:21:58 +0000953PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000954 PyObject *str, /* String */
955 PyObject *substr, /* Substring to find */
956 PyObject *replstr, /* Substring to replace */
957 Py_ssize_t maxcount /* Max. number of replacements to apply;
958 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +0000959 );
960
961/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +0200962 greater than, respectively.
963 On error, raise an exception and return -1 (check PyErr_Occurred()). */
Guido van Rossumd8225182000-03-10 22:33:05 +0000964
Mark Hammond91a681d2002-08-12 07:21:58 +0000965PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 PyObject *left, /* Left string */
967 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000968 );
969
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200970/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
971 equal, and greater than, respectively. It is best to pass only
972 ASCII-encoded strings, but the function interprets the input string as
973 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +0200974 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200975
Martin v. Löwis5b222132007-06-10 09:51:05 +0000976PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
977 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000978 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +0000979 );
980
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000981/* Rich compare two strings and return one of the following:
982
983 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +0000984 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000985 - Py_NotImplemented in case the type combination is unknown
986
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000987 Possible values for op:
988
989 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
990
991*/
992
993PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000994 PyObject *left, /* Left string */
995 PyObject *right, /* Right string */
996 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000997 );
998
Serhiy Storchakad65c9492015-11-02 14:10:23 +0200999/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001000 the resulting Unicode string. */
1001
Mark Hammond91a681d2002-08-12 07:21:58 +00001002PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001003 PyObject *format, /* Format string */
1004 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001005 );
1006
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001007/* Checks whether element is contained in container and returns 1/0
1008 accordingly.
1009
Martin Pantercc71a792016-04-05 06:19:42 +00001010 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001011 returned in case of an error. */
1012
Mark Hammond91a681d2002-08-12 07:21:58 +00001013PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 PyObject *container, /* Container string */
1015 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001016 );
1017
Martin v. Löwis47383402007-08-15 07:32:56 +00001018/* Checks whether argument is a valid identifier. */
1019
1020PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1021
Guido van Rossumd8225182000-03-10 22:33:05 +00001022/* === Characters Type APIs =============================================== */
1023
/* Pull in the non-limited part of the Unicode API, but only when
   Py_LIMITED_API is not defined. Py_CPYTHON_UNICODEOBJECT_H is defined just
   for the duration of the #include and undefined right after it.
   NOTE(review): presumably cpython/unicodeobject.h checks this macro to
   reject direct inclusion -- verify in that header. */
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03001024#ifndef Py_LIMITED_API
Victor Stinner75e46992018-11-26 17:29:38 +01001025# define Py_CPYTHON_UNICODEOBJECT_H
1026# include "cpython/unicodeobject.h"
1027# undef Py_CPYTHON_UNICODEOBJECT_H
1028#endif
Raymond Hettingerac2ef652015-07-04 16:04:44 -07001029
Guido van Rossumd8225182000-03-10 22:33:05 +00001030#ifdef __cplusplus
1031}
1032#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001033#endif /* !Py_UNICODEOBJECT_H */