Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1 | #ifndef Py_UNICODEOBJECT_H |
| 2 | #define Py_UNICODEOBJECT_H |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 3 | |
Christian Heimes | af98da1 | 2008-01-27 15:18:18 +0000 | [diff] [blame] | 4 | #include <stdarg.h> |
| 5 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 6 | /* |
| 7 | |
| 8 | Unicode implementation based on original code by Fredrik Lundh, |
| 9 | modified by Marc-Andre Lemburg (mal@lemburg.com) according to the |
Alexander Belopolsky | 83283c2 | 2010-11-16 14:29:01 +0000 | [diff] [blame] | 10 | Unicode Integration Proposal. (See |
| 11 | http://www.egenix.com/files/python/unicode-proposal.txt). |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 12 | |
Guido van Rossum | 16b1ad9 | 2000-08-03 16:24:25 +0000 | [diff] [blame] | 13 | Copyright (c) Corporation for National Research Initiatives. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 14 | |
| 15 | |
| 16 | Original header: |
| 17 | -------------------------------------------------------------------- |
| 18 | |
| 19 | * Yet another Unicode string type for Python. This type supports the |
| 20 | * 16-bit Basic Multilingual Plane (BMP) only. |
| 21 | * |
| 22 | * Written by Fredrik Lundh, January 1999. |
| 23 | * |
| 24 | * Copyright (c) 1999 by Secret Labs AB. |
| 25 | * Copyright (c) 1999 by Fredrik Lundh. |
| 26 | * |
| 27 | * fredrik@pythonware.com |
| 28 | * http://www.pythonware.com |
| 29 | * |
| 30 | * -------------------------------------------------------------------- |
| 31 | * This Unicode String Type is |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 32 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 33 | * Copyright (c) 1999 by Secret Labs AB |
| 34 | * Copyright (c) 1999 by Fredrik Lundh |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 35 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 36 | * By obtaining, using, and/or copying this software and/or its |
| 37 | * associated documentation, you agree that you have read, understood, |
| 38 | * and will comply with the following terms and conditions: |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 39 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 40 | * Permission to use, copy, modify, and distribute this software and its |
| 41 | * associated documentation for any purpose and without fee is hereby |
| 42 | * granted, provided that the above copyright notice appears in all |
| 43 | * copies, and that both that copyright notice and this permission notice |
| 44 | * appear in supporting documentation, and that the name of Secret Labs |
| 45 | * AB or the author not be used in advertising or publicity pertaining to |
| 46 | * distribution of the software without specific, written prior |
| 47 | * permission. |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 48 | * |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 49 | * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
| 50 | * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 51 | * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
| 52 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 53 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 54 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| 55 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 56 | * -------------------------------------------------------------------- */ |
| 57 | |
Marc-André Lemburg | 5e6007c | 2001-09-19 11:21:03 +0000 | [diff] [blame] | 58 | #include <ctype.h> |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 59 | |
| 60 | /* === Internal API ======================================================= */ |
| 61 | |
| 62 | /* --- Internal Unicode Format -------------------------------------------- */ |
| 63 | |
Christian Heimes | 0625e89 | 2008-01-07 21:04:21 +0000 | [diff] [blame] | 64 | /* Python 3.x requires unicode */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 65 | #define Py_USING_UNICODE |
Christian Heimes | 0625e89 | 2008-01-07 21:04:21 +0000 | [diff] [blame] | 66 | |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 67 | #ifndef SIZEOF_WCHAR_T |
| 68 | #error Must define SIZEOF_WCHAR_T |
Fredrik Lundh | 9b14ab3 | 2001-06-26 22:59:49 +0000 | [diff] [blame] | 69 | #endif |
| 70 | |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 71 | #define Py_UNICODE_SIZE SIZEOF_WCHAR_T |
| 72 | |
| 73 | /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. |
| 74 | Otherwise, Unicode strings are stored as UCS-2 (with limited support |
| 75 | for UTF-16) */ |
Fredrik Lundh | 8f45585 | 2001-06-27 18:59:43 +0000 | [diff] [blame] | 76 | |
| 77 | #if Py_UNICODE_SIZE >= 4 |
| 78 | #define Py_UNICODE_WIDE |
Martin v. Löwis | 0ba70cc | 2001-06-26 22:22:37 +0000 | [diff] [blame] | 79 | #endif |
Fredrik Lundh | 1294ad0 | 2001-06-26 17:17:07 +0000 | [diff] [blame] | 80 | |
Amaury Forgeot d'Arc | feb7307 | 2010-09-12 22:42:57 +0000 | [diff] [blame] | 81 | /* Set these flags if the platform has "wchar.h" and the |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 82 | wchar_t type is a 16-bit unsigned type */ |
| 83 | /* #define HAVE_WCHAR_H */ |
| 84 | /* #define HAVE_USABLE_WCHAR_T */ |
| 85 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 86 | /* If the compiler provides a wchar_t type we try to support it |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 87 | through the interface functions PyUnicode_FromWideChar(), |
| 88 | PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 89 | |
| 90 | #ifdef HAVE_USABLE_WCHAR_T |
Marc-André Lemburg | 1a731c6 | 2000-08-11 11:43:10 +0000 | [diff] [blame] | 91 | # ifndef HAVE_WCHAR_H |
| 92 | # define HAVE_WCHAR_H |
| 93 | # endif |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 94 | #endif |
| 95 | |
| 96 | #ifdef HAVE_WCHAR_H |
Marc-André Lemburg | 5e6007c | 2001-09-19 11:21:03 +0000 | [diff] [blame] | 97 | # include <wchar.h> |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 98 | #endif |
| 99 | |
Georg Brandl | c6bc4c6 | 2011-10-05 16:23:09 +0200 | [diff] [blame] | 100 | /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 101 | unicode representations (32-, 16- and 8-bit unsigned code units). */ |
Benjamin Peterson | a13e367 | 2016-09-08 11:38:28 -0700 | [diff] [blame] | 102 | typedef uint32_t Py_UCS4; |
| 103 | typedef uint16_t Py_UCS2; |
| 104 | typedef uint8_t Py_UCS1; |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 105 | |
Barry Warsaw | 51ac580 | 2000-03-20 16:36:48 +0000 | [diff] [blame] | 106 | #ifdef __cplusplus |
| 107 | extern "C" { |
| 108 | #endif |
| 109 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 110 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 111 | PyAPI_DATA(PyTypeObject) PyUnicode_Type; |
Christian Heimes | a22e8bd | 2007-11-29 22:35:39 +0000 | [diff] [blame] | 112 | PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 113 | |
Thomas Wouters | 27d517b | 2007-02-25 20:39:11 +0000 | [diff] [blame] | 114 | #define PyUnicode_Check(op) \ |
Christian Heimes | 90aa764 | 2007-12-19 02:45:37 +0000 | [diff] [blame] | 115 | PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) |
| 116 | #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 117 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 118 | /* --- Constants ---------------------------------------------------------- */ |
| 119 | |
| 120 | /* This Unicode character will be used as replacement character during |
| 121 | decoding if the errors argument is set to "replace". Note: the |
| 122 | Unicode character U+FFFD is the official REPLACEMENT CHARACTER in |
| 123 | Unicode 3.0. */ |
| 124 | |
Victor Stinner | 5ce1b0d | 2011-09-28 20:29:27 +0200 | [diff] [blame] | 125 | #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 126 | |
| 127 | /* === Public API ========================================================= */ |
| 128 | |
Georg Brandl | 952867a | 2010-06-27 10:17:12 +0000 | [diff] [blame] | 129 | /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ |
Walter Dörwald | d203431 | 2007-05-18 16:29:38 +0000 | [diff] [blame] | 130 | PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( |
Victor Stinner | 0d71116 | 2010-12-27 02:39:20 +0000 | [diff] [blame] | 131 | const char *u, /* UTF-8 encoded string */ |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 132 | Py_ssize_t size /* size of buffer */ |
Walter Dörwald | d203431 | 2007-05-18 16:29:38 +0000 | [diff] [blame] | 133 | ); |
| 134 | |
Walter Dörwald | acaa5a1 | 2007-05-05 12:00:46 +0000 | [diff] [blame] | 135 | /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 136 | UTF-8 encoded bytes. The size is determined with strlen(). */ |
Walter Dörwald | acaa5a1 | 2007-05-05 12:00:46 +0000 | [diff] [blame] | 137 | PyAPI_FUNC(PyObject*) PyUnicode_FromString( |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 138 | const char *u /* UTF-8 encoded string */ |
Walter Dörwald | acaa5a1 | 2007-05-05 12:00:46 +0000 | [diff] [blame] | 139 | ); |
| 140 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 141 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 142 | PyAPI_FUNC(PyObject*) PyUnicode_Substring( |
| 143 | PyObject *str, |
| 144 | Py_ssize_t start, |
| 145 | Py_ssize_t end); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 146 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 147 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 148 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Georg Brandl | db6c7f5 | 2011-10-07 11:19:11 +0200 | [diff] [blame] | 149 | /* Copy the string into a UCS4 buffer including the null character if copy_null |
Serhiy Storchaka | cc16423 | 2016-10-02 21:29:26 +0300 | [diff] [blame] | 150 | is set. Return NULL and raise an exception on error. Raise a SystemError if |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 151 | the buffer is smaller than the string. Return buffer on success. |
| 152 | |
| 153 | buflen is the length of the buffer in (Py_UCS4) characters. */ |
| 154 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( |
| 155 | PyObject *unicode, |
| 156 | Py_UCS4* buffer, |
| 157 | Py_ssize_t buflen, |
| 158 | int copy_null); |
| 159 | |
| 160 | /* Copy the string into a UCS4 buffer. A new buffer is allocated using |
| 161 | PyMem_Malloc; if this fails, NULL is returned with a memory error |
| 162 | exception set. */ |
| 163 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 164 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 165 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 166 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 167 | /* Get the length of the Unicode object. */ |
| 168 | |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 169 | PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( |
| 170 | PyObject *unicode |
| 171 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 172 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 173 | |
Victor Stinner | 157f83f | 2011-09-28 21:41:31 +0200 | [diff] [blame] | 174 | /* Get the number of Py_UNICODE units in the |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 175 | string representation. */ |
| 176 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 177 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 178 | PyObject *unicode /* Unicode object */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 179 | ); |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 180 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 181 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 182 | /* Read a character from the string. */ |
| 183 | |
| 184 | PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( |
| 185 | PyObject *unicode, |
| 186 | Py_ssize_t index |
| 187 | ); |
| 188 | |
| 189 | /* Write a character to the string. The string must have been created through |
Victor Stinner | cd9950f | 2011-10-02 00:34:53 +0200 | [diff] [blame] | 190 | PyUnicode_New, must not be shared, and must not have been hashed yet. |
| 191 | |
| 192 | Return 0 on success, -1 on error. */ |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 193 | |
| 194 | PyAPI_FUNC(int) PyUnicode_WriteChar( |
| 195 | PyObject *unicode, |
| 196 | Py_ssize_t index, |
| 197 | Py_UCS4 character |
| 198 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 199 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 200 | |
Martin Panter | 6245cb3 | 2016-04-15 02:14:19 +0000 | [diff] [blame] | 201 | /* Resize a Unicode object. The length is the number of characters, except |
Victor Stinner | b0a82a6 | 2011-12-12 13:08:33 +0100 | [diff] [blame] | 202 | if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length |
| 203 | is the number of Py_UNICODE characters. |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 204 | |
| 205 | *unicode is modified to point to the new (resized) object and 0 |
| 206 | is returned on success. |
| 207 | |
Victor Stinner | b0a82a6 | 2011-12-12 13:08:33 +0100 | [diff] [blame] | 208 | Try to resize the string in place (which is usually faster than allocating |
| 209 | a new string and copying characters), or create a new string. |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 210 | |
| 211 | Error handling is implemented as follows: an exception is set, -1 |
Victor Stinner | 16e6a80 | 2011-12-12 13:24:15 +0100 | [diff] [blame] | 212 | is returned and *unicode is left untouched. |
| 213 | |
| 214 | WARNING: The function doesn't check string content, the result may not be a |
| 215 | string in canonical representation. */ |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 216 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 217 | PyAPI_FUNC(int) PyUnicode_Resize( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 218 | PyObject **unicode, /* Pointer to the Unicode object */ |
| 219 | Py_ssize_t length /* New length */ |
Guido van Rossum | 52c2359 | 2000-04-10 13:41:41 +0000 | [diff] [blame] | 220 | ); |
| 221 | |
Serhiy Storchaka | 6a7b3a7 | 2016-04-17 08:32:47 +0300 | [diff] [blame] | 222 | /* Decode obj to a Unicode object. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 223 | |
Martin Panter | 20d3255 | 2016-04-15 00:56:21 +0000 | [diff] [blame] | 224 | bytes, bytearray and other bytes-like objects are decoded according to the |
| 225 | given encoding and error handler. The encoding and error handler can be |
| 226 | NULL to have the interface use UTF-8 and "strict". |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 227 | |
Martin Panter | 20d3255 | 2016-04-15 00:56:21 +0000 | [diff] [blame] | 228 | All other objects (including Unicode objects) raise an exception. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 229 | |
| 230 | The API returns NULL in case of an error. The caller is responsible |
| 231 | for decref'ing the returned objects. |
| 232 | |
| 233 | */ |
| 234 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 235 | PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 236 | PyObject *obj, /* Object */ |
Marc-André Lemburg | 5a5c81a | 2000-07-07 13:46:42 +0000 | [diff] [blame] | 237 | const char *encoding, /* encoding */ |
| 238 | const char *errors /* error handling */ |
| 239 | ); |
| 240 | |
Martin Panter | 20d3255 | 2016-04-15 00:56:21 +0000 | [diff] [blame] | 241 | /* Copy an instance of a Unicode subtype to a new true Unicode object if |
| 242 | necessary. If obj is already a true Unicode object (not a subtype), return |
| 243 | the reference with *incremented* refcount. |
Marc-André Lemburg | 5a5c81a | 2000-07-07 13:46:42 +0000 | [diff] [blame] | 244 | |
| 245 | The API returns NULL in case of an error. The caller is responsible |
| 246 | for decref'ing the returned objects. |
| 247 | |
| 248 | */ |
| 249 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 250 | PyAPI_FUNC(PyObject*) PyUnicode_FromObject( |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 251 | PyObject *obj /* Object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 252 | ); |
| 253 | |
Victor Stinner | 1205f27 | 2010-09-11 00:54:47 +0000 | [diff] [blame] | 254 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( |
| 255 | const char *format, /* ASCII-encoded string */ |
| 256 | va_list vargs |
| 257 | ); |
| 258 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( |
| 259 | const char *format, /* ASCII-encoded string */ |
| 260 | ... |
| 261 | ); |
Walter Dörwald | d203431 | 2007-05-18 16:29:38 +0000 | [diff] [blame] | 262 | |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 263 | PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); |
| 264 | PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 265 | PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( |
| 266 | const char *u /* UTF-8 encoded string */ |
| 267 | ); |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 268 | |
| 269 | /* Use only if you know it's a string */ |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 270 | #define PyUnicode_CHECK_INTERNED(op) \ |
| 271 | (((PyASCIIObject *)(op))->state.interned) |
Walter Dörwald | 1680713 | 2007-05-25 13:52:07 +0000 | [diff] [blame] | 272 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 273 | /* --- wchar_t support for platforms which support it --------------------- */ |
| 274 | |
| 275 | #ifdef HAVE_WCHAR_H |
| 276 | |
Georg Brandl | 952867a | 2010-06-27 10:17:12 +0000 | [diff] [blame] | 277 | /* Create a Unicode Object from the wchar_t buffer w of the given |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 278 | size. |
| 279 | |
| 280 | The buffer is copied into the new object. */ |
| 281 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 282 | PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 283 | const wchar_t *w, /* wchar_t buffer */ |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 284 | Py_ssize_t size /* size of buffer */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 285 | ); |
| 286 | |
Marc-André Lemburg | a9cadcd | 2004-11-22 13:02:31 +0000 | [diff] [blame] | 287 | /* Copies the Unicode Object contents into the wchar_t buffer w. At |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 288 | most size wchar_t characters are copied. |
| 289 | |
Marc-André Lemburg | a9cadcd | 2004-11-22 13:02:31 +0000 | [diff] [blame] | 290 | Note that the resulting wchar_t string may or may not be |
| 291 | 0-terminated. It is the responsibility of the caller to make sure |
| 292 | that the wchar_t string is 0-terminated in case this is required by |
| 293 | the application. |
| 294 | |
| 295 | Returns the number of wchar_t characters copied (excluding a |
| 296 | possibly trailing 0-termination character) or -1 in case of an |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 297 | error. */ |
| 298 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 299 | PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( |
Martin v. Löwis | 4d0d471 | 2010-12-03 20:14:31 +0000 | [diff] [blame] | 300 | PyObject *unicode, /* Unicode object */ |
Antoine Pitrou | 9ed5f27 | 2013-08-13 20:18:52 +0200 | [diff] [blame] | 301 | wchar_t *w, /* wchar_t buffer */ |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 302 | Py_ssize_t size /* size of buffer */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 303 | ); |
| 304 | |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 305 | /* Convert the Unicode object to a wide character string. The output string |
| 306 | always ends with a null character. If size is not NULL, write the number of |
Victor Stinner | d88d983 | 2011-09-06 02:00:05 +0200 | [diff] [blame] | 307 | wide characters (excluding the null character) into *size. |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 308 | |
Victor Stinner | 22fabe2 | 2015-02-11 18:17:56 +0100 | [diff] [blame] | 309 | Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 310 | on success. On error, returns NULL and raises a MemoryError; *size is |
| 311 | undefined in that case. */ |
| 312 | |
| 313 | PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( |
Victor Stinner | beb4135b | 2010-10-07 01:02:42 +0000 | [diff] [blame] | 314 | PyObject *unicode, /* Unicode object */ |
Victor Stinner | 137c34c | 2010-09-29 10:25:54 +0000 | [diff] [blame] | 315 | Py_ssize_t *size /* number of characters of the result */ |
| 316 | ); |
| 317 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 318 | #endif |
| 319 | |
Marc-André Lemburg | cc8764c | 2002-08-11 12:23:04 +0000 | [diff] [blame] | 320 | /* --- Unicode ordinals --------------------------------------------------- */ |
| 321 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 322 | /* Create a Unicode Object from the given Unicode code point ordinal. |
| 323 | |
Ezio Melotti | e7f9037 | 2012-10-05 03:33:31 +0300 | [diff] [blame] | 324 | The ordinal must be in range(0x110000). A ValueError is |
Marc-André Lemburg | cc8764c | 2002-08-11 12:23:04 +0000 | [diff] [blame] | 325 | raised in case it is not. |
| 326 | |
| 327 | */ |
| 328 | |
Marc-André Lemburg | 9c329de | 2002-08-12 08:19:10 +0000 | [diff] [blame] | 329 | PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); |
Marc-André Lemburg | cc8764c | 2002-08-11 12:23:04 +0000 | [diff] [blame] | 330 | |
Benjamin Peterson | 960cf0f | 2009-01-09 04:11:44 +0000 | [diff] [blame] | 331 | /* --- Free-list management ----------------------------------------------- */ |
| 332 | |
| 333 | /* Clear the free list used by the Unicode implementation. |
| 334 | |
| 335 | This can be used to release memory used for objects on the free |
| 336 | list back to the Python memory allocator. |
| 337 | |
| 338 | */ |
| 339 | |
| 340 | PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); |
| 341 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 342 | /* === Builtin Codecs ===================================================== |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 343 | |
| 344 | Many of these APIs take two arguments encoding and errors. These |
| 345 | parameters encoding and errors have the same semantics as the ones |
Alexander Belopolsky | 83283c2 | 2010-11-16 14:29:01 +0000 | [diff] [blame] | 346 | of the builtin str() API. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 347 | |
Georg Brandl | 952867a | 2010-06-27 10:17:12 +0000 | [diff] [blame] | 348 | Setting encoding to NULL causes the default encoding (UTF-8) to be used. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 349 | |
| 350 | Error handling is set by errors which may also be set to NULL |
| 351 | meaning to use the default handling defined for the codec. Default |
| 352 | error handling for all builtin codecs is "strict" (ValueErrors are |
| 353 | raised). |
| 354 | |
| 355 | The codecs all use a similar interface. Only deviations from the |
| 356 | generic ones are documented. |
| 357 | |
| 358 | */ |
| 359 | |
Fred Drake | cb093fe | 2000-05-09 19:51:53 +0000 | [diff] [blame] | 360 | /* --- Manage the default encoding ---------------------------------------- */ |
| 361 | |
Alexander Belopolsky | 83283c2 | 2010-11-16 14:29:01 +0000 | [diff] [blame] | 362 | /* Returns "utf-8". */ |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 363 | PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); |
Fred Drake | cb093fe | 2000-05-09 19:51:53 +0000 | [diff] [blame] | 364 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 365 | /* --- Generic Codecs ----------------------------------------------------- */ |
| 366 | |
| 367 | /* Create a Unicode object by decoding the encoded string s of the |
| 368 | given size. */ |
| 369 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 370 | PyAPI_FUNC(PyObject*) PyUnicode_Decode( |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 371 | const char *s, /* encoded string */ |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 372 | Py_ssize_t size, /* size of buffer */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 373 | const char *encoding, /* encoding */ |
| 374 | const char *errors /* error handling */ |
| 375 | ); |
| 376 | |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 377 | /* Decode a Unicode object unicode and return the result as Python |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 378 | object. |
| 379 | |
| 380 | This API is DEPRECATED. The only supported standard encoding is rot13. |
| 381 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
| 382 | that decode from str. */ |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 383 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 384 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 385 | PyObject *unicode, /* Unicode object */ |
| 386 | const char *encoding, /* encoding */ |
| 387 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 388 | ); |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 389 | |
| 390 | /* Decode a Unicode object unicode and return the result as Unicode |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 391 | object. |
| 392 | |
| 393 | This API is DEPRECATED. The only supported standard encoding is rot13. |
| 394 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
| 395 | that decode from str to str. */ |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 396 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 397 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 398 | PyObject *unicode, /* Unicode object */ |
| 399 | const char *encoding, /* encoding */ |
| 400 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 401 | ); |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 402 | |
Marc-André Lemburg | d2d4598 | 2004-07-08 17:57:32 +0000 | [diff] [blame] | 403 | /* Encodes a Unicode object and returns the result as Python |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 404 | object. |
| 405 | |
Ville Skyttä | 49b2734 | 2017-08-03 09:00:59 +0300 | [diff] [blame] | 406 | This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 407 | since all standard encodings (except rot13) encode str to bytes. |
| 408 | Use PyCodec_Encode() for encoding with rot13 and non-standard codecs |
| 409 | that encode from str to non-bytes. */ |
Marc-André Lemburg | d2d4598 | 2004-07-08 17:57:32 +0000 | [diff] [blame] | 410 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 411 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 412 | PyObject *unicode, /* Unicode object */ |
| 413 | const char *encoding, /* encoding */ |
| 414 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 415 | ); |
Marc-André Lemburg | d2d4598 | 2004-07-08 17:57:32 +0000 | [diff] [blame] | 416 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 417 | /* Encodes a Unicode object and returns the result as Python bytes |
| 418 | object. */ |
| 419 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 420 | PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 421 | PyObject *unicode, /* Unicode object */ |
| 422 | const char *encoding, /* encoding */ |
| 423 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 424 | ); |
| 425 | |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 426 | /* Encodes a Unicode object and returns the result as Unicode |
Serhiy Storchaka | 0093907 | 2016-10-27 21:05:49 +0300 | [diff] [blame] | 427 | object. |
| 428 | |
| 429 | This API is DEPRECATED. The only supported standard encoding is rot13. |
| 430 | Use PyCodec_Encode() to encode with rot13 and non-standard codecs |
| 431 | that encode from str to str. */ |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 432 | |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 433 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 434 | PyObject *unicode, /* Unicode object */ |
| 435 | const char *encoding, /* encoding */ |
| 436 | const char *errors /* error handling */ |
Zackery Spytz | 3c8724f | 2019-05-28 09:16:33 -0600 | [diff] [blame] | 437 | ); |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 438 | |
| 439 | /* Build an encoding map. */ |
| 440 | |
Thomas Wouters | 73e5a5b | 2006-06-08 15:35:45 +0000 | [diff] [blame] | 441 | PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( |
| 442 | PyObject* string /* 256 character map */ |
| 443 | ); |
| 444 | |
Marc-André Lemburg | c60e6f7 | 2001-09-20 10:35:46 +0000 | [diff] [blame] | 445 | /* --- UTF-7 Codecs ------------------------------------------------------- */ |
| 446 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 447 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 448 | const char *string, /* UTF-7 encoded string */ |
| 449 | Py_ssize_t length, /* size of string */ |
| 450 | const char *errors /* error handling */ |
Marc-André Lemburg | c60e6f7 | 2001-09-20 10:35:46 +0000 | [diff] [blame] | 451 | ); |
| 452 | |
Christian Heimes | 5d14c2b | 2007-11-20 23:38:09 +0000 | [diff] [blame] | 453 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 454 | const char *string, /* UTF-7 encoded string */ |
| 455 | Py_ssize_t length, /* size of string */ |
| 456 | const char *errors, /* error handling */ |
| 457 | Py_ssize_t *consumed /* bytes consumed */ |
Christian Heimes | 5d14c2b | 2007-11-20 23:38:09 +0000 | [diff] [blame] | 458 | ); |
| 459 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 460 | /* --- UTF-8 Codecs ------------------------------------------------------- */ |
| 461 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 462 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 463 | const char *string, /* UTF-8 encoded string */ |
| 464 | Py_ssize_t length, /* size of string */ |
| 465 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 466 | ); |
| 467 | |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 468 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 469 | const char *string, /* UTF-8 encoded string */ |
| 470 | Py_ssize_t length, /* size of string */ |
| 471 | const char *errors, /* error handling */ |
| 472 | Py_ssize_t *consumed /* bytes consumed */ |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 473 | ); |
| 474 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 475 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 476 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 477 | ); |
| 478 | |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 479 | /* --- UTF-32 Codecs ------------------------------------------------------ */ |
| 480 | |
| 481 | /* Decodes length bytes from a UTF-32 encoded buffer string and returns |
| 482 | the corresponding Unicode object. |
| 483 | |
| 484 | errors (if non-NULL) defines the error handling. It defaults |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 485 | to "strict". |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 486 | |
| 487 | If byteorder is non-NULL, the decoder starts decoding using the |
| 488 | given byte order: |
| 489 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 490 | *byteorder == -1: little endian |
| 491 | *byteorder == 0: native order |
| 492 | *byteorder == 1: big endian |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 493 | |
| 494 | In native mode, the first four bytes of the stream are checked for a |
| 495 | BOM mark. If found, the BOM mark is analysed, the byte order |
| 496 | adjusted and the BOM skipped. In the other modes, no BOM mark |
| 497 | interpretation is done. After completion, *byteorder is set to the |
| 498 | current byte order at the end of input data. |
| 499 | |
| 500 | If byteorder is NULL, the codec starts in native order mode. |
| 501 | |
| 502 | */ |
| 503 | |
| 504 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 505 | const char *string, /* UTF-32 encoded string */ |
| 506 | Py_ssize_t length, /* size of string */ |
| 507 | const char *errors, /* error handling */ |
| 508 | int *byteorder /* pointer to byteorder to use |
| 509 | 0=native;-1=LE,1=BE; updated on |
| 510 | exit */ |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 511 | ); |
| 512 | |
| 513 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 514 | const char *string, /* UTF-32 encoded string */ |
| 515 | Py_ssize_t length, /* size of string */ |
| 516 | const char *errors, /* error handling */ |
| 517 | int *byteorder, /* pointer to byteorder to use |
| 518 | 0=native;-1=LE,1=BE; updated on |
| 519 | exit */ |
| 520 | Py_ssize_t *consumed /* bytes consumed */ |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 521 | ); |
| 522 | |
| 523 | /* Returns a Python bytes object using the UTF-32 encoding in native byte |
| 524 | order. The string always starts with a BOM mark. */ |
| 525 | |
| 526 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 527 | PyObject *unicode /* Unicode object */ |
Walter Dörwald | 41980ca | 2007-08-16 21:55:45 +0000 | [diff] [blame] | 528 | ); |
| 529 | |
| 530 | /* Returns a Python string object holding the UTF-32 encoded value of |
| 531 | the Unicode data. |
| 532 | |
| 533 | The output is written according to the following |
| 534 | byte order: |
| 535 | |
| 536 | byteorder == -1: little endian |
| 537 | byteorder == 0: native byte order (writes a BOM mark) |
| 538 | byteorder == 1: big endian |
| 539 | |
| 540 | If byteorder is 0, the output string will always start with the |
| 541 | Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is |
| 542 | prepended. |
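| 543 | NOTE(review): no declaration follows this comment in this header; confirm which encoder it documents. |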
| 544 | */ |
| 545 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 546 | /* --- UTF-16 Codecs ------------------------------------------------------ */ |
| 547 | |
Guido van Rossum | 9e896b3 | 2000-04-05 20:11:21 +0000 | [diff] [blame] | 548 | /* Decodes length bytes from a UTF-16 encoded buffer string and returns |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 549 | the corresponding Unicode object. |
| 550 | |
| 551 | errors (if non-NULL) defines the error handling. It defaults |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 552 | to "strict". |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 553 | |
| 554 | If byteorder is non-NULL, the decoder starts decoding using the |
| 555 | given byte order: |
| 556 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 557 | *byteorder == -1: little endian |
| 558 | *byteorder == 0: native order |
| 559 | *byteorder == 1: big endian |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 560 | |
Marc-André Lemburg | 489b56e | 2001-05-21 20:30:15 +0000 | [diff] [blame] | 561 | In native mode, the first two bytes of the stream are checked for a |
| 562 | BOM mark. If found, the BOM mark is analysed, the byte order |
| 563 | adjusted and the BOM skipped. In the other modes, no BOM mark |
| 564 | interpretation is done. After completion, *byteorder is set to the |
| 565 | current byte order at the end of input data. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 566 | |
| 567 | If byteorder is NULL, the codec starts in native order mode. |
| 568 | |
| 569 | */ |
| 570 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 571 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 572 | const char *string, /* UTF-16 encoded string */ |
| 573 | Py_ssize_t length, /* size of string */ |
| 574 | const char *errors, /* error handling */ |
| 575 | int *byteorder /* pointer to byteorder to use |
| 576 | 0=native;-1=LE,1=BE; updated on |
| 577 | exit */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 578 | ); |
| 579 | |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 580 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 581 | const char *string, /* UTF-16 encoded string */ |
| 582 | Py_ssize_t length, /* size of string */ |
| 583 | const char *errors, /* error handling */ |
| 584 | int *byteorder, /* pointer to byteorder to use |
| 585 | 0=native;-1=LE,1=BE; updated on |
| 586 | exit */ |
| 587 | Py_ssize_t *consumed /* bytes consumed */ |
Walter Dörwald | 6965203 | 2004-09-07 20:24:22 +0000 | [diff] [blame] | 588 | ); |
| 589 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 590 | /* Returns a Python bytes object using the UTF-16 encoding in native byte |
| 591 | order. The string always starts with a BOM mark. */ |
| 592 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 593 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 594 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 595 | ); |
| 596 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 597 | /* --- Unicode-Escape Codecs ---------------------------------------------- */ |
| 598 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 599 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 600 | const char *string, /* Unicode-Escape encoded string */ |
| 601 | Py_ssize_t length, /* size of string */ |
| 602 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 603 | ); |
| 604 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 605 | PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 606 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 607 | ); |
| 608 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 609 | /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ |
| 610 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 611 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 612 | const char *string, /* Raw-Unicode-Escape encoded string */ |
| 613 | Py_ssize_t length, /* size of string */ |
| 614 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 615 | ); |
| 616 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 617 | PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 618 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 619 | ); |
| 620 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 621 | /* --- Latin-1 Codecs ----------------------------------------------------- |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 622 | |
Victor Stinner | 75e4699 | 2018-11-26 17:29:38 +0100 | [diff] [blame] | 623 | Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 624 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 625 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 626 | const char *string, /* Latin-1 encoded string */ |
| 627 | Py_ssize_t length, /* size of string */ |
| 628 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 629 | ); |
| 630 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 631 | PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 632 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 633 | ); |
| 634 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 635 | /* --- ASCII Codecs ------------------------------------------------------- |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 636 | |
| 637 | Only 7-bit ASCII data is accepted. All other codes generate errors. |
| 638 | |
| 639 | */ |
| 640 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 641 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 642 | const char *string, /* ASCII encoded string */ |
| 643 | Py_ssize_t length, /* size of string */ |
| 644 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 645 | ); |
| 646 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 647 | PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 648 | PyObject *unicode /* Unicode object */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 649 | ); |
| 650 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 651 | /* --- Character Map Codecs ----------------------------------------------- |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 652 | |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 653 | This codec uses mappings to encode and decode characters. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 654 | |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 655 | Decoding mappings must map byte ordinals (integers in the range from 0 to |
| 656 | 255) to Unicode strings, integers (which are then interpreted as Unicode |
| 657 | ordinals) or None. Unmapped data bytes (ones which cause a LookupError) |
| 658 | as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined |
| 659 | mapping" and cause an error. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 660 | |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 661 | Encoding mappings must map Unicode ordinal integers to bytes objects, |
| 662 | integers in the range from 0 to 255 or None. Unmapped character |
| 663 | ordinals (ones which cause a LookupError) as well as mapped to |
| 664 | None are treated as "undefined mapping" and cause an error. |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 665 | |
| 666 | */ |
| 667 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 668 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 669 | const char *string, /* Encoded string */ |
| 670 | Py_ssize_t length, /* size of string */ |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 671 | PyObject *mapping, /* decoding mapping */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 672 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 673 | ); |
| 674 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 675 | PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 676 | PyObject *unicode, /* Unicode object */ |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 677 | PyObject *mapping /* encoding mapping */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 678 | ); |
| 679 | |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 680 | /* --- MBCS codecs for Windows -------------------------------------------- */ |
Guido van Rossum | 24bdb04 | 2000-03-28 20:29:59 +0000 | [diff] [blame] | 681 | |
Victor Stinner | 75e4699 | 2018-11-26 17:29:38 +0100 | [diff] [blame] | 682 | #ifdef MS_WINDOWS |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 683 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 684 | const char *string, /* MBCS encoded string */ |
Steve Dower | f5aba58 | 2016-09-06 19:42:27 -0700 | [diff] [blame] | 685 | Py_ssize_t length, /* size of string */ |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 686 | const char *errors /* error handling */ |
| 687 | ); |
| 688 | |
Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 689 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( |
| 690 | const char *string, /* MBCS encoded string */ |
| 691 | Py_ssize_t length, /* size of string */ |
| 692 | const char *errors, /* error handling */ |
| 693 | Py_ssize_t *consumed /* bytes consumed */ |
| 694 | ); |
| 695 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 696 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 697 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( |
| 698 | int code_page, /* code page number */ |
| 699 | const char *string, /* encoded string */ |
| 700 | Py_ssize_t length, /* size of string */ |
| 701 | const char *errors, /* error handling */ |
| 702 | Py_ssize_t *consumed /* bytes consumed */ |
| 703 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 704 | #endif |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 705 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 706 | PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( |
Guido van Rossum | efec115 | 2000-03-28 02:01:15 +0000 | [diff] [blame] | 707 | PyObject *unicode /* Unicode object */ |
| 708 | ); |
| 709 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 710 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 711 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( |
| 712 | int code_page, /* code page number */ |
| 713 | PyObject *unicode, /* Unicode object */ |
| 714 | const char *errors /* error handling */ |
| 715 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 716 | #endif |
Victor Stinner | 3a50e70 | 2011-10-18 21:21:00 +0200 | [diff] [blame] | 717 | |
Steve Dower | cc16be8 | 2016-09-08 10:35:16 -0700 | [diff] [blame] | 718 | #endif /* MS_WINDOWS */ |
Guido van Rossum | 24bdb04 | 2000-03-28 20:29:59 +0000 | [diff] [blame] | 719 | |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 720 | /* --- Locale encoding --------------------------------------------------- */ |
| 721 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 722 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 723 | /* Decode a string from the current locale encoding. The decoder is strict if |
| 724 | *errors* is NULL or "strict"; if it is "surrogateescape", it uses the 'surrogateescape' |
| 725 | error handler (PEP 383) to escape undecodable bytes. If a byte sequence can |
| 726 | be decoded as a surrogate character and *errors* is "surrogateescape", |
| 727 | the byte sequence is escaped using the 'surrogateescape' error handler |
| 728 | instead of being decoded. *str* must end with a null character but cannot |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 729 | contain embedded null characters. */ |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 730 | |
| 731 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( |
| 732 | const char *str, /* locale-encoded string */ |
| 733 | Py_ssize_t len, /* length of str */ |
Victor Stinner | 1b57967 | 2011-12-17 05:47:23 +0100 | [diff] [blame] | 734 | const char *errors /* error handling */); |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 735 | |
| 736 | /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string |
| 737 | length using strlen(). */ |
| 738 | |
| 739 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( |
| 740 | const char *str, /* null-terminated locale-encoded string */ |
Victor Stinner | 1b57967 | 2011-12-17 05:47:23 +0100 | [diff] [blame] | 741 | const char *errors /* error handling */); |
Victor Stinner | af02e1c | 2011-12-16 23:56:01 +0100 | [diff] [blame] | 742 | |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 743 | /* Encode a Unicode object to the current locale encoding. The encoder is |
| 744 | strict if *errors* is NULL or "strict"; if it is "surrogateescape", the |
| 745 | "surrogateescape" error handler is used. Return a bytes object. The string |
Victor Stinner | d45c7f8 | 2012-12-04 01:34:47 +0100 | [diff] [blame] | 746 | cannot contain embedded null characters. */ |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 747 | |
| 748 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( |
| 749 | PyObject *unicode, /* Unicode object */ |
Victor Stinner | 1b57967 | 2011-12-17 05:47:23 +0100 | [diff] [blame] | 750 | const char *errors /* error handling */ |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 751 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 752 | #endif |
Victor Stinner | f2ea71f | 2011-12-17 04:13:41 +0100 | [diff] [blame] | 753 | |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 754 | /* --- File system encoding ---------------------------------------------- */ |
| 755 | |
Victor Stinner | 47fcb5b | 2010-08-13 23:59:58 +0000 | [diff] [blame] | 756 | /* ParseTuple converter: encode str objects to bytes using |
| 757 | PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 758 | |
| 759 | PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); |
| 760 | |
Victor Stinner | 47fcb5b | 2010-08-13 23:59:58 +0000 | [diff] [blame] | 761 | /* ParseTuple converter: decode bytes objects to unicode using |
| 762 | PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ |
| 763 | |
| 764 | PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); |
| 765 | |
Victor Stinner | 77c3862 | 2010-05-14 15:58:55 +0000 | [diff] [blame] | 766 | /* Decode a null-terminated string using Py_FileSystemDefaultEncoding |
| 767 | and the "surrogateescape" error handler. |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 768 | |
Victor Stinner | f3170cc | 2010-10-15 12:04:23 +0000 | [diff] [blame] | 769 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
| 770 | encoding. |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 771 | |
Benjamin Peterson | ccbd694 | 2010-05-15 17:43:18 +0000 | [diff] [blame] | 772 | Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 773 | */ |
| 774 | |
| 775 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( |
| 776 | const char *s /* encoded string */ |
| 777 | ); |
| 778 | |
Victor Stinner | 77c3862 | 2010-05-14 15:58:55 +0000 | [diff] [blame] | 779 | /* Decode a string using Py_FileSystemDefaultEncoding |
| 780 | and the "surrogateescape" error handler. |
| 781 | |
Victor Stinner | f3170cc | 2010-10-15 12:04:23 +0000 | [diff] [blame] | 782 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
| 783 | encoding. |
Victor Stinner | 77c3862 | 2010-05-14 15:58:55 +0000 | [diff] [blame] | 784 | */ |
| 785 | |
Martin v. Löwis | 011e842 | 2009-05-05 04:43:17 +0000 | [diff] [blame] | 786 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( |
| 787 | const char *s, /* encoded string */ |
| 788 | Py_ssize_t size /* size */ |
| 789 | ); |
| 790 | |
Victor Stinner | ae6265f | 2010-05-15 16:27:27 +0000 | [diff] [blame] | 791 | /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the |
Benjamin Peterson | ccbd694 | 2010-05-15 17:43:18 +0000 | [diff] [blame] | 792 | "surrogateescape" error handler, and return bytes. |
Victor Stinner | ae6265f | 2010-05-15 16:27:27 +0000 | [diff] [blame] | 793 | |
Victor Stinner | f3170cc | 2010-10-15 12:04:23 +0000 | [diff] [blame] | 794 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
| 795 | encoding. |
Victor Stinner | ae6265f | 2010-05-15 16:27:27 +0000 | [diff] [blame] | 796 | */ |
| 797 | |
| 798 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( |
| 799 | PyObject *unicode /* Unicode object */ |
| 800 | ); |
| 801 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 802 | /* --- Methods & Slots ---------------------------------------------------- |
| 803 | |
| 804 | These are capable of handling Unicode objects and strings on input |
| 805 | (we refer to them as strings in the descriptions) and return |
Georg Brandl | c6bc4c6 | 2011-10-05 16:23:09 +0200 | [diff] [blame] | 806 | Unicode objects or integers as appropriate. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 807 | |
| 808 | /* Concat two strings giving a new Unicode string. */ |
| 809 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 810 | PyAPI_FUNC(PyObject*) PyUnicode_Concat( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 811 | PyObject *left, /* Left string */ |
| 812 | PyObject *right /* Right string */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 813 | ); |
| 814 | |
Walter Dörwald | 1ab8330 | 2007-05-18 17:15:44 +0000 | [diff] [blame] | 815 | /* Concat two strings and put the result in *pleft |
| 816 | (sets *pleft to NULL on error) */ |
| 817 | |
| 818 | PyAPI_FUNC(void) PyUnicode_Append( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 819 | PyObject **pleft, /* Pointer to left string */ |
| 820 | PyObject *right /* Right string */ |
Walter Dörwald | 1ab8330 | 2007-05-18 17:15:44 +0000 | [diff] [blame] | 821 | ); |
| 822 | |
| 823 | /* Concat two strings, put the result in *pleft and drop the right object |
| 824 | (sets *pleft to NULL on error) */ |
| 825 | |
| 826 | PyAPI_FUNC(void) PyUnicode_AppendAndDel( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 827 | PyObject **pleft, /* Pointer to left string */ |
| 828 | PyObject *right /* Right string */ |
Walter Dörwald | 1ab8330 | 2007-05-18 17:15:44 +0000 | [diff] [blame] | 829 | ); |
| 830 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 831 | /* Split a string giving a list of Unicode strings. |
| 832 | |
| 833 | If sep is NULL, splitting will be done at all whitespace |
| 834 | substrings. Otherwise, splits occur at the given separator. |
| 835 | |
| 836 | At most maxsplit splits will be done. If negative, no limit is set. |
| 837 | |
| 838 | Separators are not included in the resulting list. |
| 839 | |
| 840 | */ |
| 841 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 842 | PyAPI_FUNC(PyObject*) PyUnicode_Split( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 843 | PyObject *s, /* String to split */ |
| 844 | PyObject *sep, /* String separator */ |
| 845 | Py_ssize_t maxsplit /* Maxsplit count */ |
| 846 | ); |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 847 | |
| 848 | /* Split a string at line breaks, giving a list of Unicode strings. |
| 849 | |
| 850 | CRLF is considered to be one line break. Line breaks are not |
| 851 | included in the resulting list. */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 852 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 853 | PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 854 | PyObject *s, /* String to split */ |
| 855 | int keepends /* If true, line end markers are included */ |
| 856 | ); |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 857 | |
Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 858 | /* Partition a string using a given separator. */ |
| 859 | |
| 860 | PyAPI_FUNC(PyObject*) PyUnicode_Partition( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 861 | PyObject *s, /* String to partition */ |
| 862 | PyObject *sep /* String separator */ |
| 863 | ); |
Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 864 | |
| 865 | /* Partition a string using a given separator, searching from the end of the |
| 866 | string. */ |
| 867 | |
| 868 | PyAPI_FUNC(PyObject*) PyUnicode_RPartition( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 869 | PyObject *s, /* String to partition */ |
| 870 | PyObject *sep /* String separator */ |
| 871 | ); |
Thomas Wouters | 477c8d5 | 2006-05-27 19:21:47 +0000 | [diff] [blame] | 872 | |
Hye-Shik Chang | 3ae811b | 2003-12-15 18:49:53 +0000 | [diff] [blame] | 873 | /* Split a string giving a list of Unicode strings. |
| 874 | |
| 875 | If sep is NULL, splitting will be done at all whitespace |
| 876 | substrings. Otherwise, splits occur at the given separator. |
| 877 | |
| 878 | At most maxsplit splits will be done. If negative, no limit is set. |
| 879 | Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of the |
| 880 | string. |
| 881 | |
| 882 | Separators are not included in the resulting list. |
| 883 | |
| 884 | */ |
| 885 | |
| 886 | PyAPI_FUNC(PyObject*) PyUnicode_RSplit( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 887 | PyObject *s, /* String to split */ |
| 888 | PyObject *sep, /* String separator */ |
| 889 | Py_ssize_t maxsplit /* Maxsplit count */ |
| 890 | ); |
Hye-Shik Chang | 3ae811b | 2003-12-15 18:49:53 +0000 | [diff] [blame] | 891 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 892 | /* Translate a string by applying a character mapping table to it and |
| 893 | return the resulting Unicode object. |
| 894 | |
Serhiy Storchaka | c85a266 | 2017-03-19 08:15:17 +0200 | [diff] [blame] | 895 | The mapping table must map Unicode ordinal integers to Unicode strings, |
| 896 | Unicode ordinal integers or None (causing deletion of the character). |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 897 | |
| 898 | Mapping tables may be dictionaries or sequences. Unmapped character |
| 899 | ordinals (ones which cause a LookupError) are left untouched and |
| 900 | are copied as-is. |
| 901 | |
| 902 | */ |
| 903 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 904 | PyAPI_FUNC(PyObject *) PyUnicode_Translate( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 905 | PyObject *str, /* String */ |
| 906 | PyObject *table, /* Translate table */ |
| 907 | const char *errors /* error handling */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 908 | ); |
| 909 | |
| 910 | /* Join the sequence of strings seq using the given separator and return |
| 911 | the resulting Unicode string. */ |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 912 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 913 | PyAPI_FUNC(PyObject*) PyUnicode_Join( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 914 | PyObject *separator, /* Separator string */ |
| 915 | PyObject *seq /* Sequence of strings to join */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 916 | ); |
| 917 | |
| 918 | /* Return 1 if substr matches str[start:end] at the tail end selected by |
| 919 | direction (-1: prefix match, +1: suffix match), 0 otherwise. NOTE(review): the error return is not stated here; presumably -1 with an exception set -- confirm. */ |
| 920 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 921 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 922 | PyObject *str, /* String to test */ |
| 923 | PyObject *substr, /* Prefix or Suffix string */ |
| 924 | Py_ssize_t start, /* Start index */ |
| 925 | Py_ssize_t end, /* Stop index */ |
| 926 | int direction /* Tail end: -1 prefix, +1 suffix */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 927 | ); |
| 928 | |
| 929 | /* Return the first position of substr in str[start:end] using the |
Marc-André Lemburg | 4da6fd6 | 2002-05-29 11:33:13 +0000 | [diff] [blame] | 930 | given search direction (+1: forward, -1: backward), or -1 if not found. |
| 931 | -2 is returned in case an error occurred and an exception is set. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 932 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 933 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 934 | PyObject *str, /* String to search */ |
| 935 | PyObject *substr, /* Substring to find */ |
| 936 | Py_ssize_t start, /* Start index */ |
| 937 | Py_ssize_t end, /* Stop index */ |
| 938 | int direction /* Find direction: +1 forward, -1 backward */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 939 | ); |
| 940 | |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 941 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 942 | /* Like PyUnicode_Find, but searches for the single character ch only. Not declared when Py_LIMITED_API is defined to a value below 0x03030000. */ |
| 943 | PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( |
| 944 | PyObject *str, /* String to search */ |
| 945 | Py_UCS4 ch, /* Character to find */ |
| 946 | Py_ssize_t start, /* Start index */ |
| 947 | Py_ssize_t end, /* Stop index */ |
| 948 | int direction /* Find direction, as for PyUnicode_Find */ |
| 949 | ); |
Serhiy Storchaka | 34d0ac8 | 2016-12-27 14:57:39 +0200 | [diff] [blame] | 950 | #endif |
Martin v. Löwis | d63a3b8 | 2011-09-28 07:41:54 +0200 | [diff] [blame] | 951 | |
Barry Warsaw | 51ac580 | 2000-03-20 16:36:48 +0000 | [diff] [blame] | 952 | /* Return the number of occurrences of substr in str[start:end]. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 953 | |
Martin v. Löwis | 18e1655 | 2006-02-15 17:27:45 +0000 | [diff] [blame] | 954 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 955 | PyObject *str, /* String to search */ |
| 956 | PyObject *substr, /* Substring to count */ |
| 957 | Py_ssize_t start, /* Start index */ |
| 958 | Py_ssize_t end /* Stop index */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 959 | ); |
| 960 | |
Barry Warsaw | 51ac580 | 2000-03-20 16:36:48 +0000 | [diff] [blame] | 961 | /* Replace at most maxcount occurrences of substr in str with replstr |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 962 | and return the resulting Unicode object. */ |
| 963 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 964 | PyAPI_FUNC(PyObject *) PyUnicode_Replace( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 965 | PyObject *str, /* String to search */ |
| 966 | PyObject *substr, /* Substring to find */ |
| 967 | PyObject *replstr, /* Replacement string */ |
| 968 | Py_ssize_t maxcount /* Max. number of replacements to apply; |
| 969 | -1 = all */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 970 | ); |
| 971 | |
| 972 | /* Compare two strings and return -1, 0, 1 for less than, equal, |
Victor Stinner | 90db9c4 | 2012-10-04 21:53:50 +0200 | [diff] [blame] | 973 | greater than, respectively. |
| 974 | Raise an exception and return -1 on error; because -1 also means less than, check for a pending exception to tell the two cases apart. */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 975 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 976 | PyAPI_FUNC(int) PyUnicode_Compare( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 977 | PyObject *left, /* Left string */ |
| 978 | PyObject *right /* Right string */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 979 | ); |
| 980 | |
Serhiy Storchaka | f5894dd | 2016-11-16 15:40:39 +0200 | [diff] [blame] | 981 | /* Compare a Unicode object with a C string and return -1, 0, 1 for less than, |
| 982 | equal, and greater than, respectively. It is best to pass only |
| 983 | ASCII-encoded strings, but the function interprets the input string as |
| 984 | ISO-8859-1 if it contains non-ASCII characters. |
Serhiy Storchaka | 419967b | 2016-12-06 00:13:34 +0200 | [diff] [blame] | 985 | This function does not raise exceptions. */ |
Serhiy Storchaka | f5894dd | 2016-11-16 15:40:39 +0200 | [diff] [blame] | 986 | |
Martin v. Löwis | 5b22213 | 2007-06-10 09:51:05 +0000 | [diff] [blame] | 987 | PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( |
| 988 | PyObject *left, /* Unicode object */ |
Victor Stinner | dc2081f | 2010-12-27 01:49:29 +0000 | [diff] [blame] | 989 | const char *right /* C string, preferably ASCII-encoded */ |
Martin v. Löwis | 5b22213 | 2007-06-10 09:51:05 +0000 | [diff] [blame] | 990 | ); |
| 991 | |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 992 | /* Rich-compare two strings using the operation op and return one of the following: |
| 993 | |
| 994 | - NULL in case an exception was raised |
Martin Panter | 69332c1 | 2016-08-04 13:07:31 +0000 | [diff] [blame] | 995 | - Py_True or Py_False for successful comparisons |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 996 | - Py_NotImplemented in case the type combination is unknown |
| 997 | |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 998 | Possible values for op: |
| 999 | |
| 1000 | Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE |
| 1001 | |
| 1002 | */ |
| 1003 | |
| 1004 | PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 1005 | PyObject *left, /* Left string */ |
| 1006 | PyObject *right, /* Right string */ |
| 1007 | int op /* Comparison operation: one of the Py_* values listed above */ |
Thomas Wouters | 00ee7ba | 2006-08-21 19:07:27 +0000 | [diff] [blame] | 1008 | ); |
| 1009 | |
Serhiy Storchaka | d65c949 | 2015-11-02 14:10:23 +0200 | [diff] [blame] | 1010 | /* Apply the argument tuple or dictionary args to the format string format |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1011 | and return the resulting Unicode string. */ |
| 1012 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 1013 | PyAPI_FUNC(PyObject *) PyUnicode_Format( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 1014 | PyObject *format, /* Format string */ |
| 1015 | PyObject *args /* Argument tuple or dictionary */ |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1016 | ); |
| 1017 | |
Guido van Rossum | d0d366b | 2000-03-13 23:22:24 +0000 | [diff] [blame] | 1018 | /* Check whether element is contained in container and return 1 or 0 |
| 1019 | accordingly. |
| 1020 | |
Martin Panter | cc71a79 | 2016-04-05 06:19:42 +0000 | [diff] [blame] | 1021 | element has to coerce to a one-element Unicode string. -1 is |
Guido van Rossum | d0d366b | 2000-03-13 23:22:24 +0000 | [diff] [blame] | 1022 | returned in case of an error. */ |
| 1023 | |
Mark Hammond | 91a681d | 2002-08-12 07:21:58 +0000 | [diff] [blame] | 1024 | PyAPI_FUNC(int) PyUnicode_Contains( |
Antoine Pitrou | f95a1b3 | 2010-05-09 15:52:27 +0000 | [diff] [blame] | 1025 | PyObject *container, /* String to search in */ |
| 1026 | PyObject *element /* String to look for */ |
Guido van Rossum | d0d366b | 2000-03-13 23:22:24 +0000 | [diff] [blame] | 1027 | ); |
| 1028 | |
Martin v. Löwis | 4738340 | 2007-08-15 07:32:56 +0000 | [diff] [blame] | 1029 | /* Check whether the argument s is a valid identifier. */ |
| 1030 | |
| 1031 | PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); |
| 1032 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1033 | /* === Characters Type APIs =============================================== */ |
| 1034 | |
Victor Stinner | fb9ea8c | 2011-10-06 01:45:57 +0200 | [diff] [blame] | 1035 | #if defined(Py_DEBUG) && !defined(Py_LIMITED_API) |
Victor Stinner | fb9ea8c | 2011-10-06 01:45:57 +0200 | [diff] [blame] | 1036 | PyAPI_FUNC(int) _PyUnicode_CheckConsistency( |
Victor Stinner | 7931d9a | 2011-11-04 00:22:48 +0100 | [diff] [blame] | 1037 | PyObject *op, |
Victor Stinner | fb9ea8c | 2011-10-06 01:45:57 +0200 | [diff] [blame] | 1038 | int check_content); |
T. Wouters | a00c3fd | 2017-03-31 09:14:41 -0700 | [diff] [blame] | 1039 | #elif !defined(NDEBUG) |
| 1040 | /* Fallback for asserts that call _PyUnicode_CheckConsistency() when asserts are enabled (NDEBUG undefined) but the function is not declared above, e.g. when building without Py_DEBUG: |
| 1041 | * the call reduces to PyUnicode_Check(op) and check_content is ignored. */ |
| 1042 | #define _PyUnicode_CheckConsistency(op, check_content) PyUnicode_Check(op) |
Victor Stinner | fb9ea8c | 2011-10-06 01:45:57 +0200 | [diff] [blame] | 1043 | #endif |
| 1044 | |
Serhiy Storchaka | 9fab79b | 2016-09-11 11:03:14 +0300 | [diff] [blame] | 1045 | #ifndef Py_LIMITED_API |
Victor Stinner | 75e4699 | 2018-11-26 17:29:38 +0100 | [diff] [blame] | 1046 | # define Py_CPYTHON_UNICODEOBJECT_H |
| 1047 | # include "cpython/unicodeobject.h" |
| 1048 | # undef Py_CPYTHON_UNICODEOBJECT_H |
| 1049 | #endif |
Raymond Hettinger | ac2ef65 | 2015-07-04 16:04:44 -0700 | [diff] [blame] | 1050 | |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1051 | #ifdef __cplusplus |
| 1052 | } |
| 1053 | #endif |
Guido van Rossum | d822518 | 2000-03-10 22:33:05 +0000 | [diff] [blame] | 1054 | #endif /* !Py_UNICODEOBJECT_H */ |