Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1 | #ifndef Py_UNICODEOBJECT_H |
| 2 | #define Py_UNICODEOBJECT_H |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 3 | |
Christian Heimes | af98da1 | 2008-01-27 15:18:18 +0000 | [diff] [blame] | 4 | #include <stdarg.h> |
| 5 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 6 | /* |
| 7 | |
| 8 | Unicode implementation based on original code by Fredrik Lundh, |
| 9 | modified by Marc-Andre Lemburg (mal@lemburg.com) according to the |
Alexander Belopolsky | 83283c2 | 2010-11-16 14:29:01 +0000 | [diff] [blame] | 10 | Unicode Integration Proposal. (See |
| 11 | http://www.egenix.com/files/python/unicode-proposal.txt). |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 12 | |
Guido van Rossum | 16b1ad9 | 2000-08-03 16:24:25 +0000 | [diff] [blame] | 13 | Copyright (c) Corporation for National Research Initiatives. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 14 | |
| 15 | |
| 16 | Original header: |
| 17 | -------------------------------------------------------------------- |
| 18 | |
| 19 | * Yet another Unicode string type for Python. This type supports the |
| 20 | * 16-bit Basic Multilingual Plane (BMP) only. |
| 21 | * |
| 22 | * Written by Fredrik Lundh, January 1999. |
| 23 | * |
| 24 | * Copyright (c) 1999 by Secret Labs AB. |
| 25 | * Copyright (c) 1999 by Fredrik Lundh. |
| 26 | * |
| 27 | * fredrik@pythonware.com |
| 28 | * http://www.pythonware.com |
| 29 | * |
| 30 | * -------------------------------------------------------------------- |
| 31 | * This Unicode String Type is |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 32 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 33 | * Copyright (c) 1999 by Secret Labs AB |
| 34 | * Copyright (c) 1999 by Fredrik Lundh |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 35 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 36 | * By obtaining, using, and/or copying this software and/or its |
| 37 | * associated documentation, you agree that you have read, understood, |
| 38 | * and will comply with the following terms and conditions: |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 39 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 40 | * Permission to use, copy, modify, and distribute this software and its |
| 41 | * associated documentation for any purpose and without fee is hereby |
| 42 | * granted, provided that the above copyright notice appears in all |
| 43 | * copies, and that both that copyright notice and this permission notice |
| 44 | * appear in supporting documentation, and that the name of Secret Labs |
| 45 | * AB or the author not be used in advertising or publicity pertaining to |
| 46 | * distribution of the software without specific, written prior |
| 47 | * permission. |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 48 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 49 | * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
| 50 | * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 51 | * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
| 52 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 53 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 54 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| 55 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 56 | * -------------------------------------------------------------------- */ |
| 57 | |
Marc-André Lemburg | 5e6007c | 2001-09-19 11:21:03 +0000 | [diff] [blame] | 58 | #include <ctype.h> |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 59 | |
| 60 | /* === Internal API ======================================================= */ |
| 61 | |
| 62 | /* --- Internal Unicode Format -------------------------------------------- */ |
| 63 | |
Christian Heimes | 0625e89 | 2008-01-07 21:04:21 +0000 | [diff] [blame] | 64 | /* Python 3.x requires unicode */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 65 | #define Py_USING_UNICODE |
Christian Heimes | 0625e89 | 2008-01-07 21:04:21 +0000 | [diff] [blame] | 66 | |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 67 | #ifndef SIZEOF_WCHAR_T |
| 68 | #error Must define SIZEOF_WCHAR_T |
Fredrik Lundh | 9b14ab3 | 2001-06-26 22:59:49 +0000 | [diff] [blame] | 69 | #endif |
| 70 | |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 71 | #define Py_UNICODE_SIZE SIZEOF_WCHAR_T |
| 72 | |
| 73 | /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. |
| 74 | Otherwise, Unicode strings are stored as UCS-2 (with limited support |
| 75 | for UTF-16) */ |
Fredrik Lundh | 8f45585 | 2001-06-27 18:59:43 +0000 | [diff] [blame] | 76 | |
| 77 | #if Py_UNICODE_SIZE >= 4 |
| 78 | #define Py_UNICODE_WIDE |
Martin v. Löwis | 0ba70cc | 2001-06-26 22:22:37 +0000 | [diff] [blame] | 79 | #endif |
Fredrik Lundh | 1294ad0 | 2001-06-26 17:17:07 +0000 | [diff] [blame] | 80 | |
Amaury Forgeot d'Arc | feb7307 | 2010-09-12 22:42:57 +0000 | [diff] [blame] | 81 | /* Set these flags if the platform has "wchar.h" and the |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 82 | wchar_t type is a 16-bit unsigned type */ |
| 83 | /* #define HAVE_WCHAR_H */ |
| 84 | /* #define HAVE_USABLE_WCHAR_T */ |
| 85 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 86 | /* If the compiler provides a wchar_t type we try to support it |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 87 | through the interface functions PyUnicode_FromWideChar(), |
| 88 | PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 89 | |
| 90 | #ifdef HAVE_USABLE_WCHAR_T |
Marc-André Lemburg | 1a731c6 | 2000-08-11 11:43:10 +0000 | [diff] [blame] | 91 | # ifndef HAVE_WCHAR_H |
| 92 | # define HAVE_WCHAR_H |
| 93 | # endif |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 94 | #endif |
| 95 | |
| 96 | #ifdef HAVE_WCHAR_H |
Marc-André Lemburg | 5e6007c | 2001-09-19 11:21:03 +0000 | [diff] [blame] | 97 | # include <wchar.h> |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 98 | #endif |
| 99 | |
Georg Brandl | c6bc4c6 | 2011-10-05 16:23:09 +0200 | [diff] [blame] | 100 | /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 101 | Unicode representations (32-, 16- and 8-bit unsigned code units). */ |
Benjamin Peterson | a13e367 | 2016-09-08 11:38:28 -0700 | [diff] [blame] | 102 | typedef uint32_t Py_UCS4; |
| 103 | typedef uint16_t Py_UCS2; |
| 104 | typedef uint8_t Py_UCS1; |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 105 | |
Barry Warsaw | 51ac580 | 2000-03-20 16:36:48 +0000 | [diff] [blame] | 106 | #ifdef __cplusplus |
| 107 | extern "C" { |
| 108 | #endif |
| 109 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 110 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 111 | PyAPI_DATA(PyTypeObject) PyUnicode_Type; |
Christian Heimes | a22e8bd | 2007-11-29 22:35:39 +0000 | [diff] [blame] | 112 | PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 113 | |
Thomas Wouters | 27d517b | 2007-02-25 20:39:11 +0000 | [diff] [blame] | 114 | #define PyUnicode_Check(op) \ |
Christian Heimes | 90aa764 | 2007-12-19 02:45:37 +0000 | [diff] [blame] | 115 | PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) |
Dong-hee Na | d905df7 | 2020-02-14 02:37:17 +0900 | [diff] [blame] | 116 | #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type) |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 117 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 118 | /* --- Constants ---------------------------------------------------------- */ |
| 119 | |
| 120 | /* This Unicode character will be used as replacement character during |
| 121 | decoding if the errors argument is set to "replace". Note: the |
| 122 | Unicode character U+FFFD is the official REPLACEMENT CHARACTER in |
| 123 | Unicode 3.0. */ |
| 124 | |
Victor Stinner | 5ce1b0d | 2011-09-28 20:29:27 +0200 | [diff] [blame] | 125 | #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 126 | |
| 127 | /* === Public API ========================================================= */ |
| 128 | |
Georg Brandl | 952867a | 2010-06-27 10:17:12 +0000 | [diff] [blame] | 129 | /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ |
Walter Dörwald | d203431 | 2007-05-18 16:29:38 +0000 | [diff] [blame] | 130 | PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( |
Victor Stinner | 0d71116 | 2010-12-27 02:39:20 +0000 | [diff] [blame] | 131 | const char *u, /* UTF-8 encoded string */ |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 132 | Py_ssize_t size /* size of buffer */ |
Walter Dörwald | d203431 | 2007-05-18 16:29:38 +0000 | [diff] [blame] | 133 | ); |
| 134 | |
Walter Dörwald | acaa5a1 | 2007-05-05 12:00:46 +0000 | [diff] [blame] | 135 | /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 136 | UTF-8 encoded bytes. The size is determined with strlen(). */ |
Walter Dörwald | acaa5a1 | 2007-05-05 12:00:46 +0000 | [diff] [blame] | 137 | PyAPI_FUNC(PyObject*) PyUnicode_FromString( |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 138 | const char *u /* UTF-8 encoded string */ |
Walter Dörwald | acaa5a1 | 2007-05-05 12:00:46 +0000 | [diff] [blame] | 139 | ); |
| 140 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 141 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 142 | PyAPI_FUNC(PyObject*) PyUnicode_Substring( |
| 143 | PyObject *str, |
| 144 | Py_ssize_t start, |
| 145 | Py_ssize_t end); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 146 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 147 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 148 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Georg Brandl | db6c7f5 | 2011-10-07 11:19:11 +0200 | [diff] [blame] | 149 | /* Copy the string into a UCS4 buffer including the null character if copy_null |
Serhiy Storchaka | cc16423 | 2016-10-02 21:29:26 +0300 | [diff] [blame] | 150 | is set. Return NULL and raise an exception on error. Raise a SystemError if |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 151 | the buffer is smaller than the string. Return buffer on success. |
| 152 | |
| 153 | buflen is the length of the buffer in (Py_UCS4) characters. */ |
| 154 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( |
| 155 | PyObject *unicode, |
| 156 | Py_UCS4* buffer, |
| 157 | Py_ssize_t buflen, |
| 158 | int copy_null); |
| 159 | |
| 160 | /* Copy the string into a UCS4 buffer. A new buffer is allocated using |
| 161 | PyMem_Malloc(); if this fails, NULL is returned with a memory error |
| 162 | exception set. */ |
| 163 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 164 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 165 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 166 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 167 | /* Get the length of the Unicode object. */ |
| 168 | |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 169 | PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( |
| 170 | PyObject *unicode |
| 171 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 172 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 173 | |
Victor Stinner | 157f83f | 2011-09-28 21:41:31 +0200 | [diff] [blame] | 174 | /* Get the number of Py_UNICODE units in the |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 175 | string representation. */ |
| 176 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 177 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 178 | PyObject *unicode /* Unicode object */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 179 | ); |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 180 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 181 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 182 | /* Read a character from the string. */ |
| 183 | |
| 184 | PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( |
| 185 | PyObject *unicode, |
| 186 | Py_ssize_t index |
| 187 | ); |
| 188 | |
| 189 | /* Write a character to the string. The string must have been created through |
Victor Stinner | cd9950f | 2011-10-02 00:34:53 +0200 | [diff] [blame] | 190 | PyUnicode_New, must not be shared, and must not have been hashed yet. |
| 191 | |
| 192 | Return 0 on success, -1 on error. */ |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 193 | |
| 194 | PyAPI_FUNC(int) PyUnicode_WriteChar( |
| 195 | PyObject *unicode, |
| 196 | Py_ssize_t index, |
| 197 | Py_UCS4 character |
| 198 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 199 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 200 | |
Martin Panter | 6245cb3 | 2016-04-15 02:14:19 +0000 | [diff] [blame] | 201 | /* Resize a Unicode object. The length is the number of characters, except |
Victor Stinner | b0a82a6 | 2011-12-12 13:08:33 +0100 | [diff] [blame] | 202 | if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length |
| 203 | is the number of Py_UNICODE characters. |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 204 | |
| 205 | *unicode is modified to point to the new (resized) object and 0 |
| 206 | returned on success. |
| 207 | |
Victor Stinner | b0a82a6 | 2011-12-12 13:08:33 +0100 | [diff] [blame] | 208 | Try to resize the string in place (which is usually faster than allocating |
| 209 | a new string and copying the characters), or create a new string. |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 210 | |
| 211 | Error handling is implemented as follows: an exception is set, -1 |
Victor Stinner | 16e6a80 | 2011-12-12 13:24:15 +0100 | [diff] [blame] | 212 | is returned and *unicode left untouched. |
| 213 | |
| 214 | WARNING: The function doesn't check string content, the result may not be a |
| 215 | string in canonical representation. */ |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 216 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 217 | PyAPI_FUNC(int) PyUnicode_Resize( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 218 | PyObject **unicode, /* Pointer to the Unicode object */ |
| 219 | Py_ssize_t length /* New length */ |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 220 | ); |
| 221 | |
Serhiy Storchaka | 6a7b3a7 | 2016-04-17 08:32:47 +0300 | [diff] [blame] | 222 | /* Decode obj to a Unicode object. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 223 | |
Martin Panter | 20d3255 | 2016-04-15 00:56:21 +0000 | [diff] [blame] | 224 | bytes, bytearray and other bytes-like objects are decoded according to the |
| 225 | given encoding and error handler. The encoding and error handler can be |
| 226 | NULL to have the interface use UTF-8 and "strict". |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 227 | |
Martin Panter | 20d3255 | 2016-04-15 00:56:21 +0000 | [diff] [blame] | 228 | All other objects (including Unicode objects) raise an exception. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 229 | |
| 230 | The API returns NULL in case of an error. The caller is responsible |
| 231 | for decref'ing the returned objects. |
| 232 | |
| 233 | */ |
| 234 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 235 | PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 236 | PyObject *obj, /* Object */ |
Marc-André Lemburg | 5a5c81a | 2000-07-07 13:46:42 +0000 | [diff] [blame] | 237 | const char *encoding, /* encoding */ |
| 238 | const char *errors /* error handling */ |
| 239 | ); |
| 240 | |
Martin Panter | 20d3255 | 2016-04-15 00:56:21 +0000 | [diff] [blame] | 241 | /* Copy an instance of a Unicode subtype to a new true Unicode object if |
| 242 | necessary. If obj is already a true Unicode object (not a subtype), return |
| 243 | the reference with *incremented* refcount. |
Marc-André Lemburg | 5a5c81a | 2000-07-07 13:46:42 +0000 | [diff] [blame] | 244 | |
| 245 | The API returns NULL in case of an error. The caller is responsible |
| 246 | for decref'ing the returned objects. |
| 247 | |
| 248 | */ |
| 249 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 250 | PyAPI_FUNC(PyObject*) PyUnicode_FromObject( |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 251 | PyObject *obj /* Object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 252 | ); |
| 253 | |
Victor Stinner | 1205f27 | 2010-09-11 00:54:47 +0000 | [diff] [blame] | 254 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( |
| 255 | const char *format, /* ASCII-encoded string */ |
| 256 | va_list vargs |
| 257 | ); |
| 258 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( |
| 259 | const char *format, /* ASCII-encoded string */ |
| 260 | ... |
| 261 | ); |
Walter Dörwald | d203431 | 2007-05-18 16:29:38 +0000 | [diff] [blame] | 262 | |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 263 | PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 264 | PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( |
| 265 | const char *u /* UTF-8 encoded string */ |
| 266 | ); |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 267 | |
Victor Stinner | 583ee5a | 2020-10-02 14:49:00 +0200 | [diff] [blame] | 268 | // PyUnicode_InternImmortal() is deprecated since Python 3.10 |
| 269 | // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead. |
| 270 | Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); |
| 271 | |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 272 | /* Use only if you know it's a string */ |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 273 | #define PyUnicode_CHECK_INTERNED(op) \ |
| 274 | (((PyASCIIObject *)(op))->state.interned) |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 275 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 276 | /* --- wchar_t support for platforms which support it --------------------- */ |
| 277 | |
| 278 | #ifdef HAVE_WCHAR_H |
| 279 | |
Georg Brandl | 952867a | 2010-06-27 10:17:12 +0000 | [diff] [blame] | 280 | /* Create a Unicode Object from the wchar_t buffer w of the given |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 281 | size. |
| 282 | |
| 283 | The buffer is copied into the new object. */ |
| 284 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 285 | PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 286 | const wchar_t *w, /* wchar_t buffer */ |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 287 | Py_ssize_t size /* size of buffer */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 288 | ); |
| 289 | |
Marc-André Lemburg | a9cadcd | 2004-11-22 13:02:31 +0000 | [diff] [blame] | 290 | /* Copies the Unicode Object contents into the wchar_t buffer w. At |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 291 | most size wchar_t characters are copied. |
| 292 | |
Marc-André Lemburg | a9cadcd | 2004-11-22 13:02:31 +0000 | [diff] [blame] | 293 | Note that the resulting wchar_t string may or may not be |
| 294 | 0-terminated. It is the responsibility of the caller to make sure |
| 295 | that the wchar_t string is 0-terminated in case this is required by |
| 296 | the application. |
| 297 | |
| 298 | Returns the number of wchar_t characters copied (excluding a |
| 299 | possibly trailing 0-termination character) or -1 in case of an |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 300 | error. */ |
| 301 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 302 | PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( |
Martin v. Löwis | 4d0d471 | 2010-12-03 20:14:31 +0000 | [diff] [blame] | 303 | PyObject *unicode, /* Unicode object */ |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 304 | wchar_t *w, /* wchar_t buffer */ |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 305 | Py_ssize_t size /* size of buffer */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 306 | ); |
| 307 | |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 308 | /* Convert the Unicode object to a wide character string. The output string |
| 309 | always ends with a nul character. If size is not NULL, write the number of |
Victor Stinner | d88d983 | 2011-09-06 02:00:05 +0200 | [diff] [blame] | 310 | wide characters (excluding the null character) into *size. |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 311 | |
Victor Stinner | 22fabe2 | 2015-02-11 18:17:56 +0100 | [diff] [blame] | 312 | Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 313 | on success. On error, raises a MemoryError and returns NULL; *size is |
| 314 | undefined in that case. */ |
| 315 | |
| 316 | PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( |
Victor Stinner | beb4135b | 2010-10-07 01:02:42 +0000 | [diff] [blame] | 317 | PyObject *unicode, /* Unicode object */ |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 318 | Py_ssize_t *size /* number of characters of the result */ |
| 319 | ); |
| 320 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 321 | #endif |
| 322 | |
Marc-André Lemburg | cc8764c | 2002-08-11 12:23:04 +0000 | [diff] [blame] | 323 | /* --- Unicode ordinals --------------------------------------------------- */ |
| 324 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 325 | /* Create a Unicode Object from the given Unicode code point ordinal. |
| 326 | |
Ezio Melotti | e7f9037 | 2012-10-05 03:33:31 +0300 | [diff] [blame] | 327 | The ordinal must be in range(0x110000). A ValueError is |
Marc-André Lemburg | cc8764c | 2002-08-11 12:23:04 +0000 | [diff] [blame] | 328 | raised in case it is not. |
| 329 | |
| 330 | */ |
| 331 | |
Marc-André Lemburg | 9c329de | 2002-08-12 08:19:10 +0000 | [diff] [blame] | 332 | PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); |
Marc-André Lemburg | cc8764c | 2002-08-11 12:23:04 +0000 | [diff] [blame] | 333 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 334 | /* === Builtin Codecs ===================================================== |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 335 | |
| 336 | Many of these APIs take two arguments encoding and errors. These |
| 337 | parameters encoding and errors have the same semantics as the ones |
Alexander Belopolsky | 83283c2 | 2010-11-16 14:29:01 +0000 | [diff] [blame] | 338 | of the builtin str() API. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 339 | |
Georg Brandl | 952867a | 2010-06-27 10:17:12 +0000 | [diff] [blame] | 340 | Setting encoding to NULL causes the default encoding (UTF-8) to be used. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 341 | |
| 342 | Error handling is set by errors which may also be set to NULL |
| 343 | meaning to use the default handling defined for the codec. Default |
| 344 | error handling for all builtin codecs is "strict" (ValueErrors are |
| 345 | raised). |
| 346 | |
| 347 | The codecs all use a similar interface. Only deviations from the |
| 348 | generic ones are documented. |
| 349 | |
| 350 | */ |
| 351 | |
Fred Drake | cb093fe | 2000-05-09 19:51:53 +0000 | [diff] [blame] | 352 | /* --- Manage the default encoding ---------------------------------------- */ |
| 353 | |
Alexander Belopolsky | 83283c2 | 2010-11-16 14:29:01 +0000 | [diff] [blame] | 354 | /* Returns "utf-8". */ |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 355 | PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); |
Fred Drake | cb093fe | 2000-05-09 19:51:53 +0000 | [diff] [blame] | 356 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 357 | /* --- Generic Codecs ----------------------------------------------------- */ |
| 358 | |
| 359 | /* Create a Unicode object by decoding the encoded string s of the |
| 360 | given size. */ |
| 361 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 362 | PyAPI_FUNC(PyObject*) PyUnicode_Decode( |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 363 | const char *s, /* encoded string */ |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 364 | Py_ssize_t size, /* size of buffer */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 365 | const char *encoding, /* encoding */ |
| 366 | const char *errors /* error handling */ |
| 367 | ); |
| 368 | |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 369 | /* Decode a Unicode object unicode and return the result as Python |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 370 | object. |
| 371 | |
| 372 | This API is DEPRECATED. The only supported standard encoding is rot13. |
| 373 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
| 374 | that decode from str. */ |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 375 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 376 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 377 | PyObject *unicode, /* Unicode object */ |
| 378 | const char *encoding, /* encoding */ |
| 379 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 380 | ); |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 381 | |
| 382 | /* Decode a Unicode object unicode and return the result as Unicode |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 383 | object. |
| 384 | |
| 385 | This API is DEPRECATED. The only supported standard encoding is rot13. |
| 386 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
| 387 | that decode from str to str. */ |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 388 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 389 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 390 | PyObject *unicode, /* Unicode object */ |
| 391 | const char *encoding, /* encoding */ |
| 392 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 393 | ); |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 394 | |
Marc-André Lemburg | d2d4598 | 2004-07-08 17:57:32 +0000 | [diff] [blame] | 395 | /* Encodes a Unicode object and returns the result as Python |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 396 | object. |
| 397 | |
Ville Skyttä | 49b2734 | 2017-08-03 09:00:59 +0300 | [diff] [blame] | 398 | This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 399 | since all standard encodings (except rot13) encode str to bytes. |
| 400 | Use PyCodec_Encode() for encoding with rot13 and non-standard codecs |
| 401 | that encode from str to non-bytes. */ |
Marc-André Lemburg | d2d4598 | 2004-07-08 17:57:32 +0000 | [diff] [blame] | 402 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 403 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 404 | PyObject *unicode, /* Unicode object */ |
| 405 | const char *encoding, /* encoding */ |
| 406 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 407 | ); |
Marc-André Lemburg | d2d4598 | 2004-07-08 17:57:32 +0000 | [diff] [blame] | 408 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 409 | /* Encodes a Unicode object and returns the result as a Python bytes |
| 410 | object. */ |
| 411 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 412 | PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 413 | PyObject *unicode, /* Unicode object */ |
| 414 | const char *encoding, /* encoding */ |
| 415 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 416 | ); |
| 417 | |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 418 | /* Encodes a Unicode object and returns the result as Unicode |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 419 | object. |
| 420 | |
| 421 | This API is DEPRECATED. The only supported standard encoding is rot13. |
| 422 | Use PyCodec_Encode() to encode with rot13 and non-standard codecs |
| 423 | that encode from str to str. */ |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 424 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 425 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 426 | PyObject *unicode, /* Unicode object */ |
| 427 | const char *encoding, /* encoding */ |
| 428 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 429 | ); |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 430 | |
| 431 | /* Build an encoding map. */ |
| 432 | |
Thomas Wouters | 73e5a5b | 2006-06-08 15:35:45 +0000 | [diff] [blame] | 433 | PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( |
| 434 | PyObject* string /* 256 character map */ |
| 435 | ); |
| 436 | |
Marc-André Lemburg | c60e6f7 | 2001-09-20 10:35:46 +0000 | [diff] [blame] | 437 | /* --- UTF-7 Codecs ------------------------------------------------------- */ |
| 438 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 439 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 440 | const char *string, /* UTF-7 encoded string */ |
| 441 | Py_ssize_t length, /* size of string */ |
| 442 | const char *errors /* error handling */ |
Marc-André Lemburg | c60e6f7 | 2001-09-20 10:35:46 +0000 | [diff] [blame] | 443 | ); |
| 444 | |
Christian Heimes | 5d14c2b | 2007-11-20 23:38:09 +0000 | [diff] [blame] | 445 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 446 | const char *string, /* UTF-7 encoded string */ |
| 447 | Py_ssize_t length, /* size of string */ |
| 448 | const char *errors, /* error handling */ |
| 449 | Py_ssize_t *consumed /* bytes consumed */ |
Christian Heimes | 5d14c2b | 2007-11-20 23:38:09 +0000 | [diff] [blame] | 450 | ); |
| 451 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 452 | /* --- UTF-8 Codecs ------------------------------------------------------- */ |
| 453 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 454 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 455 | const char *string, /* UTF-8 encoded string */ |
| 456 | Py_ssize_t length, /* size of string */ |
| 457 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 458 | ); |
| 459 | |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 460 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 461 | const char *string, /* UTF-8 encoded string */ |
| 462 | Py_ssize_t length, /* size of string */ |
| 463 | const char *errors, /* error handling */ |
| 464 | Py_ssize_t *consumed /* bytes consumed */ |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 465 | ); |
| 466 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 467 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 468 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 469 | ); |
| 470 | |
Alex Gaynor | 3a8fdb2 | 2020-10-19 18:17:50 -0400 | [diff] [blame] | 471 | /* Returns a pointer to the default encoding (UTF-8) of the |
| 472 | Unicode object unicode and the size of the encoded representation |
| 473 | in bytes stored in *size. |
| 474 | |
| 475 | In case of an error, *size is not set. |
| 476 | |
| 477 | This function caches the UTF-8 encoded string in the unicodeobject |
| 478 | and subsequent calls will return the same string. The memory is released |
| 479 | when the unicodeobject is deallocated. |
| 480 | */ |
| 481 | |
| 482 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 |
| 483 | PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( |
| 484 | PyObject *unicode, |
| 485 | Py_ssize_t *size); |
| 486 | #endif |
| 487 | |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 488 | /* --- UTF-32 Codecs ------------------------------------------------------ */ |
| 489 | |
| 490 | /* Decodes length bytes from a UTF-32 encoded buffer string and returns |
| 491 | the corresponding Unicode object. |
| 492 | |
| 493 | errors (if non-NULL) defines the error handling. It defaults |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 494 | to "strict". |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 495 | |
| 496 | If byteorder is non-NULL, the decoder starts decoding using the |
| 497 | given byte order: |
| 498 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 499 | *byteorder == -1: little endian |
| 500 | *byteorder == 0: native order |
| 501 | *byteorder == 1: big endian |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 502 | |
| 503 | In native mode, the first four bytes of the stream are checked for a |
| 504 | BOM mark. If found, the BOM mark is analysed, the byte order |
| 505 | adjusted and the BOM skipped. In the other modes, no BOM mark |
| 506 | interpretation is done. After completion, *byteorder is set to the |
| 507 | current byte order at the end of input data. |
| 508 | |
| 509 | If byteorder is NULL, the codec starts in native order mode. |
| 510 | |
| 511 | */ |
| 512 | |
| 513 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 514 | const char *string, /* UTF-32 encoded string */ |
| 515 | Py_ssize_t length, /* size of string */ |
| 516 | const char *errors, /* error handling */ |
| 517 | int *byteorder /* pointer to byteorder to use |
| 518 | 0=native;-1=LE,1=BE; updated on |
| 519 | exit */ |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 520 | ); |
| 521 | |
| 522 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 523 | const char *string, /* UTF-32 encoded string */ |
| 524 | Py_ssize_t length, /* size of string */ |
| 525 | const char *errors, /* error handling */ |
| 526 | int *byteorder, /* pointer to byteorder to use |
| 527 | 0=native;-1=LE,1=BE; updated on |
| 528 | exit */ |
| 529 | Py_ssize_t *consumed /* bytes consumed */ |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 530 | ); |
| 531 | |
| 532 | /* Returns a Python bytes object using the UTF-32 encoding in native byte |
| 533 | order. The string always starts with a BOM mark. */ |
| 534 | |
| 535 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 536 | PyObject *unicode /* Unicode object */ |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 537 | ); |
| 538 | |
| 539 | /* Returns a Python string object holding the UTF-32 encoded value of |
| 540 | the Unicode data. |
| 541 | |
| 542 | If byteorder is not 0, output is written according to the following |
| 543 | byte order: |
| 544 | |
| 545 | byteorder == -1: little endian |
| 546 | byteorder == 0: native byte order (writes a BOM mark) |
| 547 | byteorder == 1: big endian |
| 548 | |
| 549 | If byteorder is 0, the output string will always start with the |
| 550 | Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is |
| 551 | prepended. |
| 552 | |
| 553 | */ |
| 554 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 555 | /* --- UTF-16 Codecs ------------------------------------------------------ */ |
| 556 | |
Guido van Rossum | 9e896b3 | 2000-04-05 20:11:21 +0000 | [diff] [blame] | 557 | /* Decodes length bytes from a UTF-16 encoded buffer string and returns |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 558 | the corresponding Unicode object. |
| 559 | |
| 560 | errors (if non-NULL) defines the error handling. It defaults |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 561 | to "strict". |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 562 | |
| 563 | If byteorder is non-NULL, the decoder starts decoding using the |
| 564 | given byte order: |
| 565 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 566 | *byteorder == -1: little endian |
| 567 | *byteorder == 0: native order |
| 568 | *byteorder == 1: big endian |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 569 | |
Marc-André Lemburg | 489b56e | 2001-05-21 20:30:15 +0000 | [diff] [blame] | 570 | In native mode, the first two bytes of the stream are checked for a |
| 571 | BOM mark. If found, the BOM mark is analysed, the byte order |
| 572 | adjusted and the BOM skipped. In the other modes, no BOM mark |
| 573 | interpretation is done. After completion, *byteorder is set to the |
| 574 | current byte order at the end of input data. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 575 | |
| 576 | If byteorder is NULL, the codec starts in native order mode. |
| 577 | |
| 578 | */ |
| 579 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 580 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 581 | const char *string, /* UTF-16 encoded string */ |
| 582 | Py_ssize_t length, /* size of string */ |
| 583 | const char *errors, /* error handling */ |
| 584 | int *byteorder /* pointer to byteorder to use |
| 585 | 0=native;-1=LE,1=BE; updated on |
| 586 | exit */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 587 | ); |
| 588 | |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 589 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 590 | const char *string, /* UTF-16 encoded string */ |
| 591 | Py_ssize_t length, /* size of string */ |
| 592 | const char *errors, /* error handling */ |
| 593 | int *byteorder, /* pointer to byteorder to use |
| 594 | 0=native;-1=LE,1=BE; updated on |
| 595 | exit */ |
| 596 | Py_ssize_t *consumed /* bytes consumed */ |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 597 | ); |
| 598 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 599 | /* Returns a Python bytes object using the UTF-16 encoding in native byte |
| 600 | order. The string always starts with a BOM mark. */ |
| 601 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 602 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 603 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 604 | ); |
| 605 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 606 | /* --- Unicode-Escape Codecs ---------------------------------------------- */ |
| 607 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 608 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 609 | const char *string, /* Unicode-Escape encoded string */ |
| 610 | Py_ssize_t length, /* size of string */ |
| 611 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 612 | ); |
| 613 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 614 | PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 615 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 616 | ); |
| 617 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 618 | /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ |
| 619 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 620 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 621 | const char *string, /* Raw-Unicode-Escape encoded string */ |
| 622 | Py_ssize_t length, /* size of string */ |
| 623 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 624 | ); |
| 625 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 626 | PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 627 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 628 | ); |
| 629 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 630 | /* --- Latin-1 Codecs ----------------------------------------------------- |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 631 | |
Victor Stinner | 75e4699 | 2018-11-26 17:29:38 +0100 | [diff] [blame] | 632 | Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 633 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 634 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 635 | const char *string, /* Latin-1 encoded string */ |
| 636 | Py_ssize_t length, /* size of string */ |
| 637 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 638 | ); |
| 639 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 640 | PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 641 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 642 | ); |
| 643 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 644 | /* --- ASCII Codecs ------------------------------------------------------- |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 645 | |
| 646 | Only 7-bit ASCII data is accepted. All other codes generate errors. |
| 647 | |
| 648 | */ |
| 649 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 650 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 651 | const char *string, /* ASCII encoded string */ |
| 652 | Py_ssize_t length, /* size of string */ |
| 653 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 654 | ); |
| 655 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 656 | PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 657 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 658 | ); |
| 659 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 660 | /* --- Character Map Codecs ----------------------------------------------- |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 661 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 662 | This codec uses mappings to encode and decode characters. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 663 | |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 664 | Decoding mappings must map byte ordinals (integers in the range from 0 to |
| 665 | 255) to Unicode strings, integers (which are then interpreted as Unicode |
| 666 | ordinals) or None. Unmapped data bytes (ones which cause a LookupError) |
| 667 | as well as ones mapped to None, 0xFFFE or '\ufffe' are treated as "undefined |
| 668 | mapping" and cause an error. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 669 | |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 670 | Encoding mappings must map Unicode ordinal integers to bytes objects, |
| 671 | integers in the range from 0 to 255 or None. Unmapped character |
| 672 | ordinals (ones which cause a LookupError) as well as ones mapped to |
| 673 | None are treated as "undefined mapping" and cause an error. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 674 | |
| 675 | */ |
| 676 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 677 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 678 | const char *string, /* Encoded string */ |
| 679 | Py_ssize_t length, /* size of string */ |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 680 | PyObject *mapping, /* decoding mapping */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 681 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 682 | ); |
| 683 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 684 | PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 685 | PyObject *unicode, /* Unicode object */ |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 686 | PyObject *mapping /* encoding mapping */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 687 | ); |
| 688 | |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 689 | /* --- MBCS codecs for Windows -------------------------------------------- */ |
Guido van Rossum | 24bdb04 | 2000-03-28 20:29:59 +0000 | [diff] [blame] | 690 | |
Victor Stinner | 75e4699 | 2018-11-26 17:29:38 +0100 | [diff] [blame] | 691 | #ifdef MS_WINDOWS |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 692 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 693 | const char *string, /* MBCS encoded string */ |
Steve Dower | f5aba58 | 2016-09-06 19:42:27 -0700 | [diff] [blame] | 694 | Py_ssize_t length, /* size of string */ |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 695 | const char *errors /* error handling */ |
| 696 | ); |
| 697 | |
Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 698 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( |
| 699 | const char *string, /* MBCS encoded string */ |
| 700 | Py_ssize_t length, /* size of string */ |
| 701 | const char *errors, /* error handling */ |
| 702 | Py_ssize_t *consumed /* bytes consumed */ |
| 703 | ); |
| 704 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 705 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 706 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( |
| 707 | int code_page, /* code page number */ |
| 708 | const char *string, /* encoded string */ |
| 709 | Py_ssize_t length, /* size of string */ |
| 710 | const char *errors, /* error handling */ |
| 711 | Py_ssize_t *consumed /* bytes consumed */ |
| 712 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 713 | #endif |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 714 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 715 | PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 716 | PyObject *unicode /* Unicode object */ |
| 717 | ); |
| 718 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 719 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 720 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( |
| 721 | int code_page, /* code page number */ |
| 722 | PyObject *unicode, /* Unicode object */ |
| 723 | const char *errors /* error handling */ |
| 724 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 725 | #endif |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 726 | |
Steve Dower | cc16be8 | 2016-09-08 10:35:16 -0700 | [diff] [blame] | 727 | #endif /* MS_WINDOWS */ |
Guido van Rossum | 24bdb04 | 2000-03-28 20:29:59 +0000 | [diff] [blame] | 728 | |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 729 | /* --- Locale encoding --------------------------------------------------- */ |
| 730 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 731 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 732 | /* Decode a string from the current locale encoding. The decoder is strict if |
| 733 | *errors* is NULL or "strict"; if *errors* is "surrogateescape", it uses that |
| 734 | error handler (PEP 383) to escape undecodable bytes. If a byte sequence can |
| 735 | be decoded as a surrogate character and *errors* is "surrogateescape", |
| 736 | the byte sequence is escaped using the 'surrogateescape' error handler |
| 737 | instead of being decoded. *str* must end with a null character but cannot |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 738 | contain embedded null characters. */ |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 739 | |
| 740 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( |
| 741 | const char *str, |
| 742 | Py_ssize_t len, |
Victor Stinner | 1b57967 | 2011-12-17 05:47:23 +0100 | [diff] [blame] | 743 | const char *errors); |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 744 | |
| 745 | /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string |
| 746 | length using strlen(). */ |
| 747 | |
| 748 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( |
| 749 | const char *str, |
Victor Stinner | 1b57967 | 2011-12-17 05:47:23 +0100 | [diff] [blame] | 750 | const char *errors); |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 751 | |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 752 | /* Encode a Unicode object to the current locale encoding. The encoder is |
| 753 | strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape", the |
| 754 | "surrogateescape" error handler is used. Return a bytes object. The string |
Victor Stinner | d45c7f8 | 2012-12-04 01:34:47 +0100 | [diff] [blame] | 755 | cannot contain embedded null characters. */ |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 756 | |
| 757 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( |
| 758 | PyObject *unicode, |
Victor Stinner | 1b57967 | 2011-12-17 05:47:23 +0100 | [diff] [blame] | 759 | const char *errors |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 760 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 761 | #endif |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 762 | |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 763 | /* --- File system encoding ---------------------------------------------- */ |
| 764 | |
Victor Stinner | 47fcb5b | 2010-08-13 23:59:58 +0000 | [diff] [blame] | 765 | /* ParseTuple converter: encode str objects to bytes using |
| 766 | PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 767 | |
| 768 | PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); |
| 769 | |
Victor Stinner | 47fcb5b | 2010-08-13 23:59:58 +0000 | [diff] [blame] | 770 | /* ParseTuple converter: decode bytes objects to unicode using |
| 771 | PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ |
| 772 | |
| 773 | PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); |
| 774 | |
Victor Stinner | 77c3862 | 2010-05-14 15:58:55 +0000 | [diff] [blame] | 775 | /* Decode a null-terminated string using Py_FileSystemDefaultEncoding |
| 776 | and the "surrogateescape" error handler. |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 777 | |
Victor Stinner | f3170cc | 2010-10-15 12:04:23 +0000 | [diff] [blame] | 778 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
| 779 | encoding. |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 780 | |
Benjamin Peterson | ccbd694 | 2010-05-15 17:43:18 +0000 | [diff] [blame] | 781 | Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 782 | */ |
| 783 | |
| 784 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( |
| 785 | const char *s /* encoded string */ |
| 786 | ); |
| 787 | |
Victor Stinner | 77c3862 | 2010-05-14 15:58:55 +0000 | [diff] [blame] | 788 | /* Decode a string using Py_FileSystemDefaultEncoding |
| 789 | and the "surrogateescape" error handler. |
| 790 | |
Victor Stinner | f3170cc | 2010-10-15 12:04:23 +0000 | [diff] [blame] | 791 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
| 792 | encoding. |
Victor Stinner | 77c3862 | 2010-05-14 15:58:55 +0000 | [diff] [blame] | 793 | */ |
| 794 | |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 795 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( |
| 796 | const char *s, /* encoded string */ |
| 797 | Py_ssize_t size /* size */ |
| 798 | ); |
| 799 | |
Victor Stinner | ae6265f | 2010-05-15 16:27:27 +0000 | [diff] [blame] | 800 | /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the |
Benjamin Peterson | ccbd694 | 2010-05-15 17:43:18 +0000 | [diff] [blame] | 801 | "surrogateescape" error handler, and return bytes. |
Victor Stinner | ae6265f | 2010-05-15 16:27:27 +0000 | [diff] [blame] | 802 | |
Victor Stinner | f3170cc | 2010-10-15 12:04:23 +0000 | [diff] [blame] | 803 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
| 804 | encoding. |
Victor Stinner | ae6265f | 2010-05-15 16:27:27 +0000 | [diff] [blame] | 805 | */ |
| 806 | |
| 807 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( |
| 808 | PyObject *unicode |
| 809 | ); |
| 810 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 811 | /* --- Methods & Slots ---------------------------------------------------- |
| 812 | |
| 813 | These are capable of handling Unicode objects and strings on input |
| 814 | (we refer to them as strings in the descriptions) and return |
Georg Brandl | c6bc4c6 | 2011-10-05 16:23:09 +0200 | [diff] [blame] | 815 | Unicode objects or integers as appropriate. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 816 | |
| 817 | /* Concat two strings giving a new Unicode string. */ |
| 818 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 819 | PyAPI_FUNC(PyObject*) PyUnicode_Concat( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 820 | PyObject *left, /* Left string */ |
| 821 | PyObject *right /* Right string */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 822 | ); |
| 823 | |
Walter Dörwald | 1ab8330 | 2007-05-18 17:15:44 +0000 | [diff] [blame] | 824 | /* Concat two strings and put the result in *pleft |
| 825 | (sets *pleft to NULL on error) */ |
| 826 | |
| 827 | PyAPI_FUNC(void) PyUnicode_Append( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 828 | PyObject **pleft, /* Pointer to left string */ |
| 829 | PyObject *right /* Right string */ |
Walter Dörwald | 1ab8330 | 2007-05-18 17:15:44 +0000 | [diff] [blame] | 830 | ); |
| 831 | |
| 832 | /* Concat two strings, put the result in *pleft and drop the right object |
| 833 | (sets *pleft to NULL on error) */ |
| 834 | |
| 835 | PyAPI_FUNC(void) PyUnicode_AppendAndDel( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 836 | PyObject **pleft, /* Pointer to left string */ |
| 837 | PyObject *right /* Right string */ |
Walter Dörwald | 1ab8330 | 2007-05-18 17:15:44 +0000 | [diff] [blame] | 838 | ); |
| 839 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 840 | /* Split a string giving a list of Unicode strings. |
| 841 | |
| 842 | If sep is NULL, splitting will be done at all whitespace |
| 843 | substrings. Otherwise, splits occur at the given separator. |
| 844 | |
| 845 | At most maxsplit splits will be done. If negative, no limit is set. |
| 846 | |
| 847 | Separators are not included in the resulting list. |
| 848 | |
| 849 | */ |
| 850 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 851 | PyAPI_FUNC(PyObject*) PyUnicode_Split( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 852 | PyObject *s, /* String to split */ |
| 853 | PyObject *sep, /* String separator */ |
| 854 | Py_ssize_t maxsplit /* Maxsplit count */ |
| 855 | ); |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 856 | |
| 857 | /* Split a string at line breaks, giving a list of Unicode strings. |
| 858 | |
| 859 | CRLF is considered to be one line break. Line breaks are not |
| 860 | included in the resulting list unless keepends is true. */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 861 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 862 | PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 863 | PyObject *s, /* String to split */ |
| 864 | int keepends /* If true, line end markers are included */ |
| 865 | ); |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 866 | |
Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 867 | /* Partition a string using a given separator. */ |
| 868 | |
| 869 | PyAPI_FUNC(PyObject*) PyUnicode_Partition( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 870 | PyObject *s, /* String to partition */ |
| 871 | PyObject *sep /* String separator */ |
| 872 | ); |
Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 873 | |
| 874 | /* Partition a string using a given separator, searching from the end of the |
| 875 | string. */ |
| 876 | |
| 877 | PyAPI_FUNC(PyObject*) PyUnicode_RPartition( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 878 | PyObject *s, /* String to partition */ |
| 879 | PyObject *sep /* String separator */ |
| 880 | ); |
Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 881 | |
Hye-Shik Chang | 3ae811b | 2003-12-15 18:49:53 +0000 | [diff] [blame] | 882 | /* Split a string giving a list of Unicode strings. |
| 883 | |
| 884 | If sep is NULL, splitting will be done at all whitespace |
| 885 | substrings. Otherwise, splits occur at the given separator. |
| 886 | |
| 887 | At most maxsplit splits will be done. If negative, no limit is set. |
| 888 | Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of the |
| 889 | string. |
| 890 | |
| 891 | Separators are not included in the resulting list. |
| 892 | |
| 893 | */ |
| 894 | |
| 895 | PyAPI_FUNC(PyObject*) PyUnicode_RSplit( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 896 | PyObject *s, /* String to split */ |
| 897 | PyObject *sep, /* String separator */ |
| 898 | Py_ssize_t maxsplit /* Maxsplit count */ |
| 899 | ); |
Hye-Shik Chang | 3ae811b | 2003-12-15 18:49:53 +0000 | [diff] [blame] | 900 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 901 | /* Translate a string by applying a character mapping table to it and |
| 902 | return the resulting Unicode object. |
| 903 | |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 904 | The mapping table must map Unicode ordinal integers to Unicode strings, |
| 905 | Unicode ordinal integers or None (causing deletion of the character). |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 906 | |
| 907 | Mapping tables may be dictionaries or sequences. Unmapped character |
| 908 | ordinals (ones which cause a LookupError) are left untouched and |
| 909 | are copied as-is. |
| 910 | |
| 911 | */ |
| 912 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 913 | PyAPI_FUNC(PyObject *) PyUnicode_Translate( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 914 | PyObject *str, /* String */ |
| 915 | PyObject *table, /* Translate table */ |
| 916 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 917 | ); |
| 918 | |
| 919 | /* Join a sequence of strings using the given separator and return |
| 920 | the resulting Unicode string. */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 921 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 922 | PyAPI_FUNC(PyObject*) PyUnicode_Join( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 923 | PyObject *separator, /* Separator string */ |
| 924 | PyObject *seq /* Sequence object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 925 | ); |
| 926 | |
| 927 | /* Return 1 if substr matches str[start:end] at the given tail end, 0 |
| 928 | otherwise. */ |
| 929 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 930 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 931 | PyObject *str, /* String */ |
| 932 | PyObject *substr, /* Prefix or Suffix string */ |
| 933 | Py_ssize_t start, /* Start index */ |
| 934 | Py_ssize_t end, /* Stop index */ |
| 935 | int direction /* Tail end: -1 prefix, +1 suffix */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 936 | ); |
| 937 | |
| 938 | /* Return the first position of substr in str[start:end] using the |
Marc-André Lemburg | 4da6fd6 | 2002-05-29 11:33:13 +0000 | [diff] [blame] | 939 | given search direction or -1 if not found. -2 is returned in case |
| 940 | an error occurred and an exception is set. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 941 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 942 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 943 | PyObject *str, /* String */ |
| 944 | PyObject *substr, /* Substring to find */ |
| 945 | Py_ssize_t start, /* Start index */ |
| 946 | Py_ssize_t end, /* Stop index */ |
| 947 | int direction /* Find direction: +1 forward, -1 backward */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 948 | ); |
| 949 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 950 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 951 | /* Like PyUnicode_Find, but search for a single character only. With Py_LIMITED_API defined, this is declared only for versions >= 0x03030000. */ |
| 952 | PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( |
| 953 | PyObject *str, /* String */ |
| 954 | Py_UCS4 ch, /* Character to find */ |
| 955 | Py_ssize_t start, /* Start index */ |
| 956 | Py_ssize_t end, /* Stop index */ |
| 957 | int direction /* Find direction: +1 forward, -1 backward */ |
| 958 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 959 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 960 | |
Barry Warsaw | 51ac580 | 2000-03-20 16:36:48 +0000 | [diff] [blame] | 961 | /* Count the number of occurrences of substr in str[start:end]. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 962 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 963 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 964 | PyObject *str, /* String */ |
| 965 | PyObject *substr, /* Substring to count */ |
| 966 | Py_ssize_t start, /* Start index */ |
| 967 | Py_ssize_t end /* Stop index */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 968 | ); |
| 969 | |
Barry Warsaw | 51ac580 | 2000-03-20 16:36:48 +0000 | [diff] [blame] | 970 | /* Replace at most maxcount occurrences of substr in str with replstr |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 971 | and return the resulting Unicode object. */ |
| 972 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 973 | PyAPI_FUNC(PyObject *) PyUnicode_Replace( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 974 | PyObject *str, /* String */ |
| 975 | PyObject *substr, /* Substring to find */ |
| 976 | PyObject *replstr, /* Replacement string */ |
| 977 | Py_ssize_t maxcount /* Max. number of replacements to apply; |
| 978 | -1 = all */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 979 | ); |
| 980 | |
| 981 | /* Compare two strings and return -1, 0, 1 for less than, equal, |
Victor Stinner | 90db9c4 | 2012-10-04 21:53:50 +0200 | [diff] [blame] | 982 | and greater than, respectively. |
| 983 | Raise an exception and return -1 on error; since -1 is also a valid result, use PyErr_Occurred() to detect errors. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 984 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 985 | PyAPI_FUNC(int) PyUnicode_Compare( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 986 | PyObject *left, /* Left string */ |
| 987 | PyObject *right /* Right string */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 988 | ); |
| 989 | |
Serhiy Storchaka | f5894dd | 2016-11-16 15:40:39 +0200 | [diff] [blame] | 990 | /* Compare a Unicode object with C string and return -1, 0, 1 for less than, |
| 991 | equal, and greater than, respectively. It is best to pass only |
| 992 | ASCII-encoded strings, but the function interprets the input string as |
| 993 | ISO-8859-1 if it contains non-ASCII characters. |
Serhiy Storchaka | 419967b | 2016-12-06 00:13:34 +0200 | [diff] [blame] | 994 | This function does not raise exceptions. */ |
Serhiy Storchaka | f5894dd | 2016-11-16 15:40:39 +0200 | [diff] [blame] | 995 | |
Martin v. Löwis | 5b22213 | 2007-06-10 09:51:05 +0000 | [diff] [blame] | 996 | PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( |
| 997 | PyObject *left, /* Unicode object */ |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 998 | const char *right /* C string; ASCII recommended, non-ASCII characters are read as ISO-8859-1 */ |
Martin v. Löwis | 5b22213 | 2007-06-10 09:51:05 +0000 | [diff] [blame] | 999 | ); |
| 1000 | |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1001 | /* Rich compare two strings and return one of the following: |
| 1002 | |
| 1003 | - NULL in case an exception was raised |
Martin Panter | 69332c1 | 2016-08-04 13:07:31 +0000 | [diff] [blame] | 1004 | - Py_True or Py_False for successful comparisons |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1005 | - Py_NotImplemented in case the type combination is unknown |
| 1006 | |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1007 | Possible values for op: |
| 1008 | |
| 1009 | Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE |
| 1010 | |
| 1011 | */ |
| 1012 | |
| 1013 | PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 1014 | PyObject *left, /* Left string */ |
| 1015 | PyObject *right, /* Right string */ |
| 1016 | int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1017 | ); |
| 1018 | |
Serhiy Storchaka | d65c949 | 2015-11-02 14:10:23 +0200 | [diff] [blame] | 1019 | /* Apply an argument tuple or dictionary to a format string and return |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1020 | the resulting Unicode string. */ |
| 1021 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 1022 | PyAPI_FUNC(PyObject *) PyUnicode_Format( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 1023 | PyObject *format, /* Format string */ |
| 1024 | PyObject *args /* Argument tuple or dictionary */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1025 | ); |
| 1026 | |
Guido van Rossum | d0d366b | 2000-03-13 23:22:24 +0000 | [diff] [blame] | 1027 | /* Check whether element is contained in container and return 1/0 |
| 1028 | accordingly. |
| 1029 | |
Martin Panter | cc71a79 | 2016-04-05 06:19:42 +0000 | [diff] [blame] | 1030 | element has to coerce to a one-element Unicode string. -1 is |
Guido van Rossum | d0d366b | 2000-03-13 23:22:24 +0000 | [diff] [blame] | 1031 | returned in case of an error. */ |
| 1032 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 1033 | PyAPI_FUNC(int) PyUnicode_Contains( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 1034 | PyObject *container, /* Container string */ |
| 1035 | PyObject *element /* Element string */ |
Guido van Rossum | d0d366b | 2000-03-13 23:22:24 +0000 | [diff] [blame] | 1036 | ); |
| 1037 | |
Martin v. Löwis | 4738340 | 2007-08-15 07:32:56 +0000 | [diff] [blame] | 1038 | /* Checks whether argument is a valid identifier. */ |
| 1039 | |
| 1040 | PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); |
| 1041 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1042 | /* === Characters Type APIs =============================================== */ |
| 1043 | |
Serhiy Storchaka | 9fab79b | 2016-09-11 11:03:14 +0300 | [diff] [blame] | 1044 | #ifndef Py_LIMITED_API |
Victor Stinner | 75e4699 | 2018-11-26 17:29:38 +0100 | [diff] [blame] | 1045 | # define Py_CPYTHON_UNICODEOBJECT_H |
| 1046 | # include "cpython/unicodeobject.h" |
| 1047 | # undef Py_CPYTHON_UNICODEOBJECT_H |
| 1048 | #endif |
Raymond Hettinger | ac2ef65 | 2015-07-04 16:04:44 -0700 | [diff] [blame] | 1049 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1050 | #ifdef __cplusplus |
| 1051 | } |
| 1052 | #endif |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1053 | #endif /* !Py_UNICODEOBJECT_H */ |