blob: 6d141b37bf8909204454f73c8681c14e31f612b8 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Guido van Rossumd8225182000-03-10 22:33:05 +000086/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000087 through the interface functions PyUnicode_FromWideChar(),
88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000089
90#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +000091# ifndef HAVE_WCHAR_H
92# define HAVE_WCHAR_H
93# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96#ifdef HAVE_WCHAR_H
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000097# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000098#endif
99
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200100/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200101 unicode representations. */
Benjamin Petersona13e3672016-09-08 11:38:28 -0700102typedef uint32_t Py_UCS4;
103typedef uint16_t Py_UCS2;
104typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
Barry Warsaw51ac5802000-03-20 16:36:48 +0000106#ifdef __cplusplus
107extern "C" {
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110
Mark Hammond91a681d2002-08-12 07:21:58 +0000111PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000112PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000113
Thomas Wouters27d517b2007-02-25 20:39:11 +0000114#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
116#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000117
Guido van Rossumd8225182000-03-10 22:33:05 +0000118/* --- Constants ---------------------------------------------------------- */
119
120/* This Unicode character will be used as replacement character during
121 decoding if the errors argument is set to "replace". Note: the
122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123 Unicode 3.0. */
124
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200125#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000126
127/* === Public API ========================================================= */
128
Georg Brandl952867a2010-06-27 10:17:12 +0000129/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000130PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000131 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000132 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000133 );
134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000135/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000137PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000138 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000139 );
140
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200141#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200142PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143 PyObject *str,
144 Py_ssize_t start,
145 Py_ssize_t end);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200146#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200148#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Georg Brandldb6c7f52011-10-07 11:19:11 +0200149/* Copy the string into a UCS4 buffer including the null character if copy_null
Serhiy Storchakacc164232016-10-02 21:29:26 +0300150 is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200151 the buffer is smaller than the string. Return buffer on success.
152
153 buflen is the length of the buffer in (Py_UCS4) characters. */
154PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155 PyObject *unicode,
156 Py_UCS4* buffer,
157 Py_ssize_t buflen,
158 int copy_null);
159
160/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161 * PyMem_Malloc; if this fails, NULL is returned with a memory error
162 exception set. */
163PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200164#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200165
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200166#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Guido van Rossumd8225182000-03-10 22:33:05 +0000167/* Get the length of the Unicode object. */
168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170 PyObject *unicode
171);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200172#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200173
Victor Stinner157f83f2011-09-28 21:41:31 +0200174/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200175 string representation. */
176
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000178 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600179 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000180
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200181#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200182/* Read a character from the string. */
183
184PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185 PyObject *unicode,
186 Py_ssize_t index
187 );
188
189/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200190 PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200193
194PyAPI_FUNC(int) PyUnicode_WriteChar(
195 PyObject *unicode,
196 Py_ssize_t index,
197 Py_UCS4 character
198 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200199#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200200
Martin Panter6245cb32016-04-15 02:14:19 +0000201/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000204
205 *unicode is modified to point to the new (resized) object and 0
206 returned on success.
207
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100208 Try to resize the string in place (which is usually faster than allocating
209 a new string and copying characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000210
211 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100212 is returned and *unicode left untouched.
213
214 WARNING: The function doesn't check string content, the result may not be a
215 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000216
Mark Hammond91a681d2002-08-12 07:21:58 +0000217PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000218 PyObject **unicode, /* Pointer to the Unicode object */
219 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000220 );
221
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300222/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000223
Martin Panter20d32552016-04-15 00:56:21 +0000224 bytes, bytearray and other bytes-like objects are decoded according to the
225 given encoding and error handler. The encoding and error handler can be
226 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000227
Martin Panter20d32552016-04-15 00:56:21 +0000228 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000229
230 The API returns NULL in case of an error. The caller is responsible
231 for decref'ing the returned objects.
232
233*/
234
Mark Hammond91a681d2002-08-12 07:21:58 +0000235PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200236 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000237 const char *encoding, /* encoding */
238 const char *errors /* error handling */
239 );
240
Martin Panter20d32552016-04-15 00:56:21 +0000241/* Copy an instance of a Unicode subtype to a new true Unicode object if
242 necessary. If obj is already a true Unicode object (not a subtype), return
243 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000244
245 The API returns NULL in case of an error. The caller is responsible
246 for decref'ing the returned objects.
247
248*/
249
Mark Hammond91a681d2002-08-12 07:21:58 +0000250PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200251 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000252 );
253
Victor Stinner1205f272010-09-11 00:54:47 +0000254PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255 const char *format, /* ASCII-encoded string */
256 va_list vargs
257 );
258PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259 const char *format, /* ASCII-encoded string */
260 ...
261 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000262
Walter Dörwald16807132007-05-25 13:52:07 +0000263PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000265PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
266 const char *u /* UTF-8 encoded string */
267 );
Walter Dörwald16807132007-05-25 13:52:07 +0000268
269/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200270#define PyUnicode_CHECK_INTERNED(op) \
271 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000272
Guido van Rossumd8225182000-03-10 22:33:05 +0000273/* --- wchar_t support for platforms which support it --------------------- */
274
275#ifdef HAVE_WCHAR_H
276
Georg Brandl952867a2010-06-27 10:17:12 +0000277/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000278 size.
279
280 The buffer is copied into the new object. */
281
Mark Hammond91a681d2002-08-12 07:21:58 +0000282PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200283 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000284 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000285 );
286
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000287/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000288 most size wchar_t characters are copied.
289
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000290 Note that the resulting wchar_t string may or may not be
291 0-terminated. It is the responsibility of the caller to make sure
292 that the wchar_t string is 0-terminated in case this is required by
293 the application.
294
295 Returns the number of wchar_t characters copied (excluding a
296 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000297 error. */
298
Martin v. Löwis18e16552006-02-15 17:27:45 +0000299PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000300 PyObject *unicode, /* Unicode object */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200301 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000302 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000303 );
304
Victor Stinner137c34c2010-09-29 10:25:54 +0000305/* Convert the Unicode object to a wide character string. The output string
306 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200307 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000308
Victor Stinner22fabe22015-02-11 18:17:56 +0100309 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
Victor Stinner137c34c2010-09-29 10:25:54 +0000310 on success. On error, raises a MemoryError and returns NULL; *size is
311 undefined in that case. */
312
313PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000314 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000315 Py_ssize_t *size /* number of characters of the result */
316 );
317
Guido van Rossumd8225182000-03-10 22:33:05 +0000318#endif
319
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000320/* --- Unicode ordinals --------------------------------------------------- */
321
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322/* Create a Unicode Object from the given Unicode code point ordinal.
323
Ezio Melottie7f90372012-10-05 03:33:31 +0300324 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000325 raised in case it is not.
326
327*/
328
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000329PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000330
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000331/* --- Free-list management ----------------------------------------------- */
332
333/* Clear the free list used by the Unicode implementation.
334
335 This can be used to release memory used for objects on the free
336 list back to the Python memory allocator.
337
338*/
339
340PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
341
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000342/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000343
344 Many of these APIs take two arguments encoding and errors. These
345 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000346 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000347
Georg Brandl952867a2010-06-27 10:17:12 +0000348 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000349
350 Error handling is set by errors which may also be set to NULL
351 meaning to use the default handling defined for the codec. Default
352 error handling for all builtin codecs is "strict" (ValueErrors are
353 raised).
354
355 The codecs all use a similar interface. Only deviations from the
356 generic ones are documented.
357
358*/
359
Fred Drakecb093fe2000-05-09 19:51:53 +0000360/* --- Manage the default encoding ---------------------------------------- */
361
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000362/* Returns "utf-8". */
Mark Hammond91a681d2002-08-12 07:21:58 +0000363PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000364
Guido van Rossumd8225182000-03-10 22:33:05 +0000365/* --- Generic Codecs ----------------------------------------------------- */
366
367/* Create a Unicode object by decoding the encoded string s of the
368 given size. */
369
Mark Hammond91a681d2002-08-12 07:21:58 +0000370PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000371 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000372 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000373 const char *encoding, /* encoding */
374 const char *errors /* error handling */
375 );
376
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000377/* Decode a Unicode object unicode and return the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300378 object.
379
380 This API is DEPRECATED. The only supported standard encoding is rot13.
381 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
382 that decode from str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000383
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600384Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000385 PyObject *unicode, /* Unicode object */
386 const char *encoding, /* encoding */
387 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600388 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000389
390/* Decode a Unicode object unicode and return the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300391 object.
392
393 This API is DEPRECATED. The only supported standard encoding is rot13.
394 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
395 that decode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000396
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600397Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 PyObject *unicode, /* Unicode object */
399 const char *encoding, /* encoding */
400 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600401 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000402
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000403/* Encodes a Unicode object and returns the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +0300404 object.
405
Ville Skyttä49b27342017-08-03 09:00:59 +0300406 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
Serhiy Storchaka00939072016-10-27 21:05:49 +0300407 since all standard encodings (except rot13) encode str to bytes.
408 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
409 that encode from str to non-bytes. */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000410
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600411Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000412 PyObject *unicode, /* Unicode object */
413 const char *encoding, /* encoding */
414 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600415 );
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000416
Guido van Rossumd8225182000-03-10 22:33:05 +0000417/* Encodes a Unicode object and returns the result as Python bytes
418 object. */
419
Mark Hammond91a681d2002-08-12 07:21:58 +0000420PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 PyObject *unicode, /* Unicode object */
422 const char *encoding, /* encoding */
423 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000424 );
425
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000426/* Encodes a Unicode object and returns the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +0300427 object.
428
429 This API is DEPRECATED. The only supported standard encoding is rot13.
430 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
431 that encode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000432
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600433Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000434 PyObject *unicode, /* Unicode object */
435 const char *encoding, /* encoding */
436 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600437 );
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000438
439/* Build an encoding map. */
440
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000441PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
442 PyObject* string /* 256 character map */
443 );
444
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000445/* --- UTF-7 Codecs ------------------------------------------------------- */
446
Mark Hammond91a681d2002-08-12 07:21:58 +0000447PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000448 const char *string, /* UTF-7 encoded string */
449 Py_ssize_t length, /* size of string */
450 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000451 );
452
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000453PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000454 const char *string, /* UTF-7 encoded string */
455 Py_ssize_t length, /* size of string */
456 const char *errors, /* error handling */
457 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000458 );
459
Guido van Rossumd8225182000-03-10 22:33:05 +0000460/* --- UTF-8 Codecs ------------------------------------------------------- */
461
Mark Hammond91a681d2002-08-12 07:21:58 +0000462PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 const char *string, /* UTF-8 encoded string */
464 Py_ssize_t length, /* size of string */
465 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000466 );
467
Walter Dörwald69652032004-09-07 20:24:22 +0000468PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 const char *string, /* UTF-8 encoded string */
470 Py_ssize_t length, /* size of string */
471 const char *errors, /* error handling */
472 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000473 );
474
Mark Hammond91a681d2002-08-12 07:21:58 +0000475PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000477 );
478
Walter Dörwald41980ca2007-08-16 21:55:45 +0000479/* --- UTF-32 Codecs ------------------------------------------------------ */
480
481/* Decodes length bytes from a UTF-32 encoded buffer string and returns
482 the corresponding Unicode object.
483
484 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000485 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000486
487 If byteorder is non-NULL, the decoder starts decoding using the
488 given byte order:
489
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000490 *byteorder == -1: little endian
491 *byteorder == 0: native order
492 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000493
494 In native mode, the first four bytes of the stream are checked for a
495 BOM mark. If found, the BOM mark is analysed, the byte order
496 adjusted and the BOM skipped. In the other modes, no BOM mark
497 interpretation is done. After completion, *byteorder is set to the
498 current byte order at the end of input data.
499
500 If byteorder is NULL, the codec starts in native order mode.
501
502*/
503
504PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505 const char *string, /* UTF-32 encoded string */
506 Py_ssize_t length, /* size of string */
507 const char *errors, /* error handling */
508 int *byteorder /* pointer to byteorder to use
509 0=native;-1=LE,1=BE; updated on
510 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000511 );
512
513PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000514 const char *string, /* UTF-32 encoded string */
515 Py_ssize_t length, /* size of string */
516 const char *errors, /* error handling */
517 int *byteorder, /* pointer to byteorder to use
518 0=native;-1=LE,1=BE; updated on
519 exit */
520 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000521 );
522
523/* Returns a Python bytes object using the UTF-32 encoding in native byte
524 order. The result always starts with a BOM mark. */
525
526PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000527 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000528 );
529
530/* Returns a Python string object holding the UTF-32 encoded value of
531 the Unicode data.
532
533 The byte order of the output is selected by the value of
534 byteorder:
535
536 byteorder == -1: little endian
537 byteorder == 0: native byte order (writes a BOM mark)
538 byteorder == 1: big endian
539
540 If byteorder is 0, the output string will always start with the
541 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
542 prepended.
543
544*/
545
Guido van Rossumd8225182000-03-10 22:33:05 +0000546/* --- UTF-16 Codecs ------------------------------------------------------ */
547
Guido van Rossum9e896b32000-04-05 20:11:21 +0000548/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000549 the corresponding Unicode object.
550
551 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000552 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000553
554 If byteorder is non-NULL, the decoder starts decoding using the
555 given byte order:
556
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000557 *byteorder == -1: little endian
558 *byteorder == 0: native order
559 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000560
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000561 In native mode, the first two bytes of the stream are checked for a
562 BOM mark. If found, the BOM mark is analysed, the byte order
563 adjusted and the BOM skipped. In the other modes, no BOM mark
564 interpretation is done. After completion, *byteorder is set to the
565 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000566
567 If byteorder is NULL, the codec starts in native order mode.
568
569*/
570
Mark Hammond91a681d2002-08-12 07:21:58 +0000571PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000572 const char *string, /* UTF-16 encoded string */
573 Py_ssize_t length, /* size of string */
574 const char *errors, /* error handling */
575 int *byteorder /* pointer to byteorder to use
576 0=native;-1=LE,1=BE; updated on
577 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000578 );
579
Walter Dörwald69652032004-09-07 20:24:22 +0000580PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000581 const char *string, /* UTF-16 encoded string */
582 Py_ssize_t length, /* size of string */
583 const char *errors, /* error handling */
584 int *byteorder, /* pointer to byteorder to use
585 0=native;-1=LE,1=BE; updated on
586 exit */
587 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000588 );
589
Guido van Rossumd8225182000-03-10 22:33:05 +0000590/* Returns a Python bytes object using the UTF-16 encoding in native byte
591 order. The result always starts with a BOM mark. */
592
Mark Hammond91a681d2002-08-12 07:21:58 +0000593PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000594 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000595 );
596
Guido van Rossumd8225182000-03-10 22:33:05 +0000597/* --- Unicode-Escape Codecs ---------------------------------------------- */
598
Mark Hammond91a681d2002-08-12 07:21:58 +0000599PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000600 const char *string, /* Unicode-Escape encoded string */
601 Py_ssize_t length, /* size of string */
602 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000603 );
604
Mark Hammond91a681d2002-08-12 07:21:58 +0000605PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000606 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000607 );
608
Guido van Rossumd8225182000-03-10 22:33:05 +0000609/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
610
Mark Hammond91a681d2002-08-12 07:21:58 +0000611PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000612 const char *string, /* Raw-Unicode-Escape encoded string */
613 Py_ssize_t length, /* size of string */
614 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000615 );
616
Mark Hammond91a681d2002-08-12 07:21:58 +0000617PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000618 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000619 );
620
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000622
Victor Stinner75e46992018-11-26 17:29:38 +0100623 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000624
Mark Hammond91a681d2002-08-12 07:21:58 +0000625PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 const char *string, /* Latin-1 encoded string */
627 Py_ssize_t length, /* size of string */
628 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000629 );
630
Mark Hammond91a681d2002-08-12 07:21:58 +0000631PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000632 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000633 );
634
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000636
637 Only 7-bit ASCII data is accepted. All other codes generate errors.
638
639*/
640
Mark Hammond91a681d2002-08-12 07:21:58 +0000641PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 const char *string, /* ASCII encoded string */
643 Py_ssize_t length, /* size of string */
644 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000645 );
646
Mark Hammond91a681d2002-08-12 07:21:58 +0000647PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000649 );
650
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +0000652
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +0000654
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200655 Decoding mappings must map byte ordinals (integers in the range from 0 to
656 255) to Unicode strings, integers (which are then interpreted as Unicode
657 ordinals) or None. Unmapped data bytes (ones which cause a LookupError),
658 as well as bytes mapped to None, 0xFFFE or '\ufffe', are treated as
659 "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000660
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200661 Encoding mappings must map Unicode ordinal integers to bytes objects,
662 integers in the range from 0 to 255 or None. Unmapped character
663 ordinals (ones which cause a LookupError), as well as ordinals mapped
664 to None, are treated as "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +0000665
666*/
667
Mark Hammond91a681d2002-08-12 07:21:58 +0000668PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 const char *string, /* Encoded string */
670 Py_ssize_t length, /* size of string */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200671 PyObject *mapping, /* decoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000673 );
674
Mark Hammond91a681d2002-08-12 07:21:58 +0000675PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200677 PyObject *mapping /* encoding mapping */
Guido van Rossumd8225182000-03-10 22:33:05 +0000678 );
679
Guido van Rossumefec1152000-03-28 02:01:15 +0000680/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000681
Victor Stinner75e46992018-11-26 17:29:38 +0100682#ifdef MS_WINDOWS
Mark Hammond91a681d2002-08-12 07:21:58 +0000683PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +0000684 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -0700685 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +0000686 const char *errors /* error handling */
687 );
688
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000689PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
690 const char *string, /* MBCS encoded string */
691 Py_ssize_t length, /* size of string */
692 const char *errors, /* error handling */
693 Py_ssize_t *consumed /* bytes consumed */
694 );
695
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200696#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200697PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
698 int code_page, /* code page number */
699 const char *string, /* encoded string */
700 Py_ssize_t length, /* size of string */
701 const char *errors, /* error handling */
702 Py_ssize_t *consumed /* bytes consumed */
703 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200704#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200705
Mark Hammond91a681d2002-08-12 07:21:58 +0000706PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +0000707 PyObject *unicode /* Unicode object */
708 );
709
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200710#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +0200711PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
712 int code_page, /* code page number */
713 PyObject *unicode, /* Unicode object */
714 const char *errors /* error handling */
715 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200716#endif
Victor Stinner3a50e702011-10-18 21:21:00 +0200717
Steve Dowercc16be82016-09-08 10:35:16 -0700718#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000719
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100720/* --- Locale encoding --------------------------------------------------- */
721
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200722#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100723/* Decode a string from the current locale encoding. *errors* selects the
724 error handler: the supported handlers are "strict" (also used if *errors*
725 is NULL) and "surrogateescape" (PEP 383), which escapes undecodable bytes.
726 If a byte sequence can be decoded as a surrogate character and the
727 "surrogateescape" error handler is in use, the byte sequence is escaped
728 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100729 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100730
731PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
732 const char *str,
733 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +0100734 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100735
736/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
737 length using strlen(). */
738
739PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
740 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +0100741 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100742
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100743/* Encode a Unicode object to the current locale encoding. *errors*
744 selects the error handler: "strict" (also used if *errors* is NULL) or
745 "surrogateescape" (PEP 383). Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +0100746 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100747
748PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
749 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +0100750 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100751 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200752#endif
Victor Stinnerf2ea71f2011-12-17 04:13:41 +0100753
Martin v. Löwis011e8422009-05-05 04:43:17 +0000754/* --- File system encoding ---------------------------------------------- */
755
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000756/* ParseTuple converter: encode str objects to bytes using
757 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +0000758
759PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
760
Victor Stinner47fcb5b2010-08-13 23:59:58 +0000761/* ParseTuple converter: decode bytes objects to unicode using
762 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
763
764PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
765
Victor Stinner77c38622010-05-14 15:58:55 +0000766/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
767 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000768
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000769 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
770 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000771
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000772 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +0000773*/
774
775PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
776 const char *s /* encoded string */
777 );
778
Victor Stinner77c38622010-05-14 15:58:55 +0000779/* Decode a string using Py_FileSystemDefaultEncoding
780 and the "surrogateescape" error handler.
781
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000782 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
783 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +0000784*/
785
Martin v. Löwis011e8422009-05-05 04:43:17 +0000786PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
787 const char *s, /* encoded string */
788 Py_ssize_t size /* size */
789 );
790
Victor Stinnerae6265f2010-05-15 16:27:27 +0000791/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +0000792 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000793
Victor Stinnerf3170cc2010-10-15 12:04:23 +0000794 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
795 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +0000796*/
797
798PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
799 PyObject *unicode
800 );
801
Guido van Rossumd8225182000-03-10 22:33:05 +0000802/* --- Methods & Slots ----------------------------------------------------
803
804 These are capable of handling Unicode objects and strings on input
805 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200806 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000807
808/* Concat two strings giving a new Unicode string. */
809
Mark Hammond91a681d2002-08-12 07:21:58 +0000810PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 PyObject *left, /* Left string */
812 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000813 );
814
Walter Dörwald1ab83302007-05-18 17:15:44 +0000815/* Concat two strings and put the result in *pleft
816 (sets *pleft to NULL on error) */
817
818PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 PyObject **pleft, /* Pointer to left string */
820 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000821 );
822
823/* Concat two strings, put the result in *pleft and drop the right object
824 (sets *pleft to NULL on error) */
825
826PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 PyObject **pleft, /* Pointer to left string */
828 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +0000829 );
830
Guido van Rossumd8225182000-03-10 22:33:05 +0000831/* Split a string giving a list of Unicode strings.
832
833 If sep is NULL, splitting will be done at all whitespace
834 substrings. Otherwise, splits occur at the given separator.
835
836 At most maxsplit splits will be done. If negative, no limit is set.
837
838 Separators are not included in the resulting list.
839
840*/
841
Mark Hammond91a681d2002-08-12 07:21:58 +0000842PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 PyObject *s, /* String to split */
844 PyObject *sep, /* String separator */
845 Py_ssize_t maxsplit /* Maxsplit count */
846 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000847
848/* Split a string at line breaks, giving a list of Unicode strings.
849
850 CRLF is considered to be one line break. Line breaks are not
851 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852
Mark Hammond91a681d2002-08-12 07:21:58 +0000853PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 PyObject *s, /* String to split */
855 int keepends /* If true, line end markers are included */
856 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000857
Thomas Wouters477c8d52006-05-27 19:21:47 +0000858/* Partition a string using a given separator. */
859
860PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 PyObject *s, /* String to partition */
862 PyObject *sep /* String separator */
863 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864
865/* Partition a string using a given separator, searching from the end of the
866 string. */
867
868PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000869 PyObject *s, /* String to partition */
870 PyObject *sep /* String separator */
871 );
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000873/* Split a string giving a list of Unicode strings.
874
875 If sep is NULL, splitting will be done at all whitespace
876 substrings. Otherwise, splits occur at the given separator.
877
878 At most maxsplit splits will be done. If maxsplit is negative, no
879 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
880 the end of the string.
881
882 Separators are not included in the resulting list.
883
884*/
885
886PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000887 PyObject *s, /* String to split */
888 PyObject *sep, /* String separator */
889 Py_ssize_t maxsplit /* Maxsplit count */
890 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +0000891
Guido van Rossumd8225182000-03-10 22:33:05 +0000892/* Translate a string by applying a character mapping table to it and
893 return the resulting Unicode object.
894
Serhiy Storchakac85a2662017-03-19 08:15:17 +0200895 The mapping table must map Unicode ordinal integers to Unicode strings,
896 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +0000897
898 Mapping tables may be dictionaries or sequences. Unmapped character
899 ordinals (ones which cause a LookupError) are left untouched and
900 are copied as-is.
901
902*/
903
Mark Hammond91a681d2002-08-12 07:21:58 +0000904PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000905 PyObject *str, /* String */
906 PyObject *table, /* Translate table */
907 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000908 );
909
910/* Join a sequence of strings using the given separator and return
911 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000912
Mark Hammond91a681d2002-08-12 07:21:58 +0000913PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 PyObject *separator, /* Separator string */
915 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000916 );
917
918/* Return 1 if substr matches str[start:end] at the given tail end, 0
919 otherwise. */
920
Martin v. Löwis18e16552006-02-15 17:27:45 +0000921PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000922 PyObject *str, /* String */
923 PyObject *substr, /* Prefix or Suffix string */
924 Py_ssize_t start, /* Start index */
925 Py_ssize_t end, /* Stop index */
926 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +0000927 );
928
929/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +0000930 given search direction or -1 if not found. -2 is returned in case
931 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000932
Martin v. Löwis18e16552006-02-15 17:27:45 +0000933PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000934 PyObject *str, /* String */
935 PyObject *substr, /* Substring to find */
936 Py_ssize_t start, /* Start index */
937 Py_ssize_t end, /* Stop index */
938 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +0000939 );
940
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200941#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942/* Like PyUnicode_Find, but search for single character only. */
943PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
944 PyObject *str, /* String */
945 Py_UCS4 ch, /* Character to find */
946 Py_ssize_t start, /* Start index */
947 Py_ssize_t end, /* Stop index */
948 int direction /* Find direction: +1 forward, -1 backward */
949 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200950#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951
Barry Warsaw51ac5802000-03-20 16:36:48 +0000952/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000953
Martin v. Löwis18e16552006-02-15 17:27:45 +0000954PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000955 PyObject *str, /* String */
956 PyObject *substr, /* Substring to count */
957 Py_ssize_t start, /* Start index */
958 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +0000959 );
960
Barry Warsaw51ac5802000-03-20 16:36:48 +0000961/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000962 and return the resulting Unicode object. */
963
Mark Hammond91a681d2002-08-12 07:21:58 +0000964PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000965 PyObject *str, /* String */
966 PyObject *substr, /* Substring to find */
967 PyObject *replstr, /* Substring to replace */
968 Py_ssize_t maxcount /* Max. number of replacements to apply;
969 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +0000970 );
971
972/* Compare two strings and return -1, 0, 1 for less than, equal, and
Victor Stinner90db9c42012-10-04 21:53:50 +0200973 greater than, respectively.
974 Raise an exception and return -1 on error. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000975
Mark Hammond91a681d2002-08-12 07:21:58 +0000976PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000977 PyObject *left, /* Left string */
978 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000979 );
980
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200981/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
982 equal, and greater than, respectively. It is best to pass only
983 ASCII-encoded strings, but the function interprets the input string as
984 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +0200985 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +0200986
Martin v. Löwis5b222132007-06-10 09:51:05 +0000987PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
988 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000989 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +0000990 );
991
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000992/* Rich compare two strings and return one of the following:
993
994 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +0000995 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000996 - Py_NotImplemented in case the type combination is unknown
997
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000998 Possible values for op:
999
1000 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1001
1002*/
1003
1004PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001005 PyObject *left, /* Left string */
1006 PyObject *right, /* Right string */
1007 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001008 );
1009
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001010/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001011 the resulting Unicode string. */
1012
Mark Hammond91a681d2002-08-12 07:21:58 +00001013PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 PyObject *format, /* Format string */
1015 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001016 );
1017
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001018/* Checks whether element is contained in container and returns 1/0
1019 accordingly.
1020
Martin Pantercc71a792016-04-05 06:19:42 +00001021 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001022 returned in case of an error. */
1023
Mark Hammond91a681d2002-08-12 07:21:58 +00001024PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001025 PyObject *container, /* Container string */
1026 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001027 );
1028
Martin v. Löwis47383402007-08-15 07:32:56 +00001029/* Checks whether argument is a valid identifier. */
1030
1031PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1032
Guido van Rossumd8225182000-03-10 22:33:05 +00001033/* === Characters Type APIs =============================================== */
1034
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001035#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001036PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01001037 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001038 int check_content);
T. Woutersa00c3fd2017-03-31 09:14:41 -07001039#elif !defined(NDEBUG)
1040/* For asserts that call _PyUnicode_CheckConsistency(), which would
1041 * otherwise be a problem when building with asserts but without Py_DEBUG. */
1042#define _PyUnicode_CheckConsistency(op, check_content) PyUnicode_Check(op)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001043#endif
1044
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03001045#ifndef Py_LIMITED_API
Victor Stinner75e46992018-11-26 17:29:38 +01001046# define Py_CPYTHON_UNICODEOBJECT_H
1047# include "cpython/unicodeobject.h"
1048# undef Py_CPYTHON_UNICODEOBJECT_H
1049#endif
Raymond Hettingerac2ef652015-07-04 16:04:44 -07001050
Guido van Rossumd8225182000-03-10 22:33:05 +00001051#ifdef __cplusplus
1052}
1053#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001054#endif /* !Py_UNICODEOBJECT_H */