blob: 0274de6733ab0e0ce2243aa530ae1815ca61480a [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
#ifndef Py_LIMITED_API
/* Py_UNICODE is a plain alias for wchar_t.  It is deprecated (see the
   Py_DEPRECATED(3.3) marker) and is only declared when the limited API
   is not requested.  PY_UNICODE_TYPE names the underlying type. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE /* Py_DEPRECATED(3.3) */;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
106#ifdef HAVE_WCHAR_H
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000107# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000108#endif
109
/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations: unsigned code units of exactly 32, 16 and
   8 bits. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200115
Guido van Rossumd8225182000-03-10 22:33:05 +0000116/* --- Internal Unicode Operations ---------------------------------------- */
117
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000118/* Since splitting on whitespace is an important use case, and
119 whitespace in most situations is solely ASCII whitespace, we
120 optimize for the common case by using a quick look-up table
121 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000122
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000123 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000124#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are answered from the
   _Py_ascii_whitespace look-up table; everything else is delegated to
   _PyUnicode_IsWhitespace().  `ch` is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions: thin aliases for the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if `ch` is alphabetic, a decimal digit, a digit or numeric.
   The tests short-circuit in that order, so `ch` may be evaluated up to
   four times: do not pass an expression with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000153
/* Copy `length` Py_UNICODE code units from `source` to `target`.
   This is a plain memcpy(), so the two regions must not overlap. */
#define Py_UNICODE_COPY(target, source, length) \
    memcpy((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Every argument is evaluated exactly once (the length is latched into a
   local before the loop), so arguments with side effects are safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_, n_ = (length); \
        Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000161
/* Macros to work with UTF-16 surrogates.  The range tests evaluate `ch`
   twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of each half are combined and 0x10000 is added. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300174
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   Both arguments are pointers to unicode objects whose wstr (Py_UNICODE)
   representation must already be present: the macro dereferences the wstr
   pointers without checking them for NULL.  The wstr pointer is read
   through a PyASCIIObject cast and the length through
   PyUnicode_WSTR_LENGTH(), because no unicode structure exposes both
   `wstr` and `wstr_length` as direct members.

   The first and the last code unit are compared before falling back to
   memcmp().  All arguments are evaluated more than once. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
182
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000183#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000184
Barry Warsaw51ac5802000-03-20 16:36:48 +0000185#ifdef __cplusplus
186extern "C" {
187#endif
188
Guido van Rossumd8225182000-03-10 22:33:05 +0000189/* --- Unicode Type ------------------------------------------------------- */
190
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000191#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200192
193/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
194 structure. state.ascii and state.compact are set, and the data
195 immediately follow the structure. utf8_length and wstr_length can be found
196 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000197typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200198 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200199
200 - compact ascii:
201
202 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100203 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200204 * kind = PyUnicode_1BYTE_KIND
205 * compact = 1
206 * ascii = 1
207 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200208 * (length is the length of the utf8 and wstr strings)
209 * (data starts just after the structure)
210 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200211
212 - compact:
213
214 * structure = PyCompactUnicodeObject
Victor Stinner80bc72d2011-12-22 03:23:10 +0100215 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200216 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
217 PyUnicode_4BYTE_KIND
218 * compact = 1
219 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200220 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200221 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200222 * utf8_length = 0 if utf8 is NULL
223 * wstr is shared with data and wstr_length=length
224 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100225 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200226 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200227 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200228
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200229 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200230
231 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100232 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100233 * length = 0 (use wstr_length)
234 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200235 * kind = PyUnicode_WCHAR_KIND
236 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200237 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200238 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100239 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200240 * wstr is not NULL
241 * data.any is NULL
242 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200243 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200245 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200246
247 * structure = PyUnicodeObject structure
Victor Stinner7a9105a2011-12-12 00:13:42 +0100248 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
250 PyUnicode_4BYTE_KIND
251 * compact = 0
252 * ready = 1
253 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200254 * utf8 is shared and utf8_length = length with data.any if ascii = 1
255 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100256 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200257 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
258 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
259 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200260
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200261 Compact strings use only one memory block (structure + characters),
262 whereas legacy strings use one block for the structure and one block
263 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 Legacy strings are created by PyUnicode_FromUnicode() and
266 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
267 when PyUnicode_READY() is called.
268
269 See also _PyUnicode_CheckConsistency().
270 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000271 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000273 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 struct {
275 /*
276 SSTATE_NOT_INTERNED (0)
277 SSTATE_INTERNED_MORTAL (1)
278 SSTATE_INTERNED_IMMORTAL (2)
279
280 If interned != SSTATE_NOT_INTERNED, the two references from the
281 dictionary to this object are *not* counted in ob_refcnt.
282 */
283 unsigned int interned:2;
284 /* Character size:
285
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200286 - PyUnicode_WCHAR_KIND (0):
287
288 * character type = wchar_t (16 or 32 bits, depending on the
289 platform)
290
291 - PyUnicode_1BYTE_KIND (1):
292
293 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100294 * all characters are in the range U+0000-U+00FF (latin1)
295 * if ascii is set, all characters are in the range U+0000-U+007F
296 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200297 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200298
299 - PyUnicode_2BYTE_KIND (2):
300
301 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100302 * all characters are in the range U+0000-U+FFFF (BMP)
303 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200304
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200305 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200306
307 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100308 * all characters are in the range U+0000-U+10FFFF
309 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200310 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200311 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 /* Compact is with respect to the allocation scheme. Compact unicode
313 objects only require one memory block while non-compact objects use
314 one block for the PyUnicodeObject struct and another for its data
315 buffer. */
316 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100317 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200318 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
319 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 unsigned int ascii:1;
321 /* The ready flag indicates whether the object layout is initialized
322 completely. This means that this is either a compact object, or
323 the data pointer is filled out. The bit is redundant, and helps
324 to minimize the test in PyUnicode_IS_READY(). */
325 unsigned int ready:1;
Antoine Pitrou8c6f8dc2014-03-23 22:55:03 +0100326 /* Padding to ensure that PyUnicode_DATA() is always aligned to
327 4 bytes (see issue #19537 on m68k). */
328 unsigned int :24;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 } state;
330 wchar_t *wstr; /* wchar_t representation (null-terminated) */
331} PyASCIIObject;
332
333/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200334 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200335 immediately follow the structure. */
336typedef struct {
337 PyASCIIObject _base;
338 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
339 * terminating \0. */
340 char *utf8; /* UTF-8 representation (null-terminated) */
341 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
342 * surrogates count as two code points. */
343} PyCompactUnicodeObject;
344
345/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
346 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200347 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348typedef struct {
349 PyCompactUnicodeObject _base;
350 union {
351 void *any;
352 Py_UCS1 *latin1;
353 Py_UCS2 *ucs2;
354 Py_UCS4 *ucs4;
355 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000356} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000357#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000358
Mark Hammond91a681d2002-08-12 07:21:58 +0000359PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000360PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000361
Thomas Wouters27d517b2007-02-25 20:39:11 +0000362#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000363 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
364#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000365
366/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000367#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368
/* Length of the wstr representation.  Compact ASCII strings have no
   wstr_length field (they use the PyASCIIObject structure), so their
   `length` field is returned instead; every other form stores the value
   in PyCompactUnicodeObject.wstr_length.
   No type checks are performed; `op` is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII((op)) ?            \
     ((PyASCIIObject*)(op))->length :              \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
373
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request by calling PyUnicode_AsUnicode(); the return value of that
   call is discarded and its effect is only checked by an assert().
   Use PyUnicode_GET_LENGTH() for the length in code points.
   `op` is evaluated more than once. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH((op)) :                  \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH((op))))
    /* Py_DEPRECATED(3.3) */

/* Size of the deprecated Py_UNICODE representation in bytes:
   PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
    /* Py_DEPRECATED(3.3) */

/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().
   An already present wstr pointer is returned directly without calling
   PyUnicode_AsUnicode().  `op` is evaluated more than once. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))
    /* Py_DEPRECATED(3.3) */

/* The PyUnicode_AS_UNICODE(op) buffer viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
    /* Py_DEPRECATED(3.3) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200406
407
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200408/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200409
/* Values for PyASCIIObject.state: */

/* Interning state, stored in the 2-bit state.interned field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
416
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  The type and the ready state are only checked by assert(). */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY((op))),           \
     ((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is evaluated more than once. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200434
/* Storage kinds recorded in PyASCIIObject.state.kind.  The value of each
   *BYTE_KIND constant equals the width of one character in bytes. */
enum PyUnicode_Kind {
/* String contains only wstr (wchar_t) characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
454
/* Return one of the PyUnicode_*_KIND values defined above, read from
   state.kind.  The type and the ready state of `op` are only checked by
   assert(); `op` is evaluated more than once. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY((op))),          \
     ((PyASCIIObject *)(op))->state.kind)
460
/* Return a void pointer to the raw unicode buffer of a compact string.
   The characters start right after the object structure: after a
   PyASCIIObject for ASCII strings, after a PyCompactUnicodeObject
   otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII((op)) ?                         \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Return the data.any pointer of a non-compact string; an assert()
   checks that it is not NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Return a void pointer to the raw unicode buffer, dispatching on the
   compact flag.  `op` is evaluated more than once. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
475
/* In the access macros below, "kind" may be evaluated more than once.
   In PyUnicode_WRITE() and PyUnicode_READ() all other macro parameters are
   evaluated exactly once, so it is safe to put side effects into them (such
   as increasing the index).  PyUnicode_READ_CHAR() evaluates its "unicode"
   argument several times, so that argument must be free of side effects. */
479
/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   The value is cast to the character type selected by kind; any kind other
   than 1BYTE or 2BYTE is treated as PyUnicode_4BYTE_KIND (only checked by
   an assert()). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
        } \
    } while (0)
502
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  The result is a Py_UCS4; any kind other
   than 1BYTE or 2BYTE is read as Py_UCS4 data. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
514
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   `unicode` is evaluated several times; `index` is evaluated once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY((unicode))),     \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530
/* Returns the length of the unicode string (the `length` field).  The
   caller has to make sure that the string has its canonical representation
   set before calling this macro.  Call PyUnicode_(FAST_)Ready to ensure
   that.  The type and the ready state are only checked by assert(). */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY((op))),          \
     ((PyASCIIObject *)(op))->length)
538
539
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Only the state.ready bit is read; no type check is performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
544
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors.  The type of `op` is only checked
   by an assert(); `op` is evaluated more than once. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                  \
     (PyUnicode_IS_READY((op)) ?                   \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.  The result is the upper bound of the
   storage class of op: 0x7f for ASCII strings, otherwise 0xff, 0xffff or
   0x10ffff depending on the kind.  `op` must be ready (checked by
   assert() only) and is evaluated more than once. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY((op))),                                  \
     (PyUnicode_IS_ASCII((op)) ?                                        \
      (0x7fU) :                                                         \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000567#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000568
569/* --- Constants ---------------------------------------------------------- */
570
/* Replacement character used during decoding when the errors argument is
   set to "replace".  Note: U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4)0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000577
578/* === Public API ========================================================= */
579
580/* --- Plain Py_UNICODE --------------------------------------------------- */
581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200582/* With PEP 393, this is the recommended way to allocate a new unicode object.
583 This function will allocate the object and its buffer in a single memory
584 block. Objects created using this function are not resizable. */
585#ifndef Py_LIMITED_API
586PyAPI_FUNC(PyObject*) PyUnicode_New(
587 Py_ssize_t size, /* Number of code points in the new string */
588 Py_UCS4 maxchar /* maximum code point value in the string */
589 );
590#endif
591
Benjamin Peterson82f34ad2015-01-13 09:17:24 -0500592/* Initializes the canonical string representation from the deprecated
Victor Stinnerd8f65102011-09-29 19:43:17 +0200593 wstr/Py_UNICODE representation. This function is used to convert Unicode
594 objects which were created using the old API to the new flexible format
595 introduced with PEP 393.
596
597 Don't call this function directly, use the public PyUnicode_READY() macro
598 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200599#ifndef Py_LIMITED_API
600PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200601 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602 );
603#endif
604
Victor Stinner034f6cf2011-09-30 02:26:44 +0200605/* Get a copy of a Unicode string. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100606#ifndef Py_LIMITED_API
607PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
Victor Stinner034f6cf2011-09-30 02:26:44 +0200608 PyObject *unicode
609 );
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100610#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612/* Copy character from one unicode object into another, this function performs
Victor Stinner3fe55312012-01-04 00:33:50 +0100613 character conversion when necessary and falls back to memcpy() if possible.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614
Victor Stinner3fe55312012-01-04 00:33:50 +0100615 Fail if to is too small (smaller than *how_many* or smaller than
Victor Stinnera0702ab2011-09-29 14:14:38 +0200616 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
Victor Stinner3fe55312012-01-04 00:33:50 +0100617 kind(to), or if *to* has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618
619 Return the number of written character, or return -1 and raise an exception
620 on error.
621
622 Pseudo-code:
623
624 how_many = min(how_many, len(from) - from_start)
625 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
626 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200627
628 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200629 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200631PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200632 PyObject *to,
633 Py_ssize_t to_start,
634 PyObject *from,
635 Py_ssize_t from_start,
636 Py_ssize_t how_many
637 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200638
639/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
640 may crash if parameters are invalid (e.g. if the output string
641 is too short). */
642PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
643 PyObject *to,
644 Py_ssize_t to_start,
645 PyObject *from,
646 Py_ssize_t from_start,
647 Py_ssize_t how_many
648 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649#endif
650
Victor Stinnerd3f08822012-05-29 12:57:52 +0200651#ifndef Py_LIMITED_API
Victor Stinner3fe55312012-01-04 00:33:50 +0100652/* Fill a string with a character: write fill_char into
653 unicode[start:start+length].
654
655 Fail if fill_char is bigger than the string maximum character, or if the
656 string has more than 1 reference.
657
658 Return the number of written character, or return -1 and raise an exception
659 on error. */
Victor Stinner3fe55312012-01-04 00:33:50 +0100660PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
661 PyObject *unicode,
662 Py_ssize_t start,
663 Py_ssize_t length,
664 Py_UCS4 fill_char
665 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200666
667/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
668 if parameters are invalid (e.g. if length is longer than the string). */
669PyAPI_FUNC(void) _PyUnicode_FastFill(
670 PyObject *unicode,
671 Py_ssize_t start,
672 Py_ssize_t length,
673 Py_UCS4 fill_char
674 );
Victor Stinner3fe55312012-01-04 00:33:50 +0100675#endif
676
Guido van Rossumd8225182000-03-10 22:33:05 +0000677/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000679
680 u may be NULL which causes the contents to be undefined. It is the
681 user's responsibility to fill in the needed data afterwards. Note
682 that modifying the Unicode object contents after construction is
683 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000684
685 The buffer is copied into the new object. */
686
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000687#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000688PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000689 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000690 Py_ssize_t size /* size of buffer */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +0200691 ) /* Py_DEPRECATED(3.3) */;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000692#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000693
Georg Brandl952867a2010-06-27 10:17:12 +0000694/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000695PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000696 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000697 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000698 );
699
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000700/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000702PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000703 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000704 );
705
Victor Stinnerd3f08822012-05-29 12:57:52 +0200706#ifndef Py_LIMITED_API
Victor Stinnerb9275c12011-10-05 14:01:42 +0200707/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
708 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
710 int kind,
711 const void *buffer,
712 Py_ssize_t size);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200713
714/* Create a new string from a buffer of ASCII characters.
715 WARNING: Don't check if the string contains any non-ASCII character. */
716PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
717 const char *buffer,
718 Py_ssize_t size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719#endif
720
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200721#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722PyAPI_FUNC(PyObject*) PyUnicode_Substring(
723 PyObject *str,
724 Py_ssize_t start,
725 Py_ssize_t end);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200726#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727
Victor Stinnerece58de2012-04-23 23:36:38 +0200728#ifndef Py_LIMITED_API
729/* Compute the maximum character of the substring unicode[start:end].
730 Return 127 for an empty string. */
731PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
732 PyObject *unicode,
733 Py_ssize_t start,
734 Py_ssize_t end);
735#endif
736
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200737#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Georg Brandldb6c7f52011-10-07 11:19:11 +0200738/* Copy the string into a UCS4 buffer including the null character if copy_null
Serhiy Storchakacc164232016-10-02 21:29:26 +0300739 is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 the buffer is smaller than the string. Return buffer on success.
741
742 buflen is the length of the buffer in (Py_UCS4) characters. */
743PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
744 PyObject *unicode,
745 Py_UCS4* buffer,
746 Py_ssize_t buflen,
747 int copy_null);
748
749/* Copy the string into a UCS4 buffer. A new buffer is allocated using
750 * PyMem_Malloc; if this fails, NULL is returned with a memory error
751 exception set. */
752PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200753#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200754
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +0300755#ifndef Py_LIMITED_API
Guido van Rossumd8225182000-03-10 22:33:05 +0000756/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200757 Py_UNICODE buffer.
758 If the wchar_t/Py_UNICODE representation is not yet available, this
759 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000760
Mark Hammond91a681d2002-08-12 07:21:58 +0000761PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 PyObject *unicode /* Unicode object */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +0200763 ) /* Py_DEPRECATED(3.3) */;
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +0300764
765/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
766 contains null characters. */
767PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
768 PyObject *unicode /* Unicode object */
769 );
Guido van Rossumd8225182000-03-10 22:33:05 +0000770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200771/* Return a read-only pointer to the Unicode object's internal
772 Py_UNICODE buffer and save the length at size.
773 If the wchar_t/Py_UNICODE representation is not yet available, this
774 function will calculate it. */
775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200776PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
777 PyObject *unicode, /* Unicode object */
778 Py_ssize_t *size /* location where to save the length */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +0200779 ) /* Py_DEPRECATED(3.3) */;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780#endif
781
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200782#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Guido van Rossumd8225182000-03-10 22:33:05 +0000783/* Get the length of the Unicode object. */
784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200785PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
786 PyObject *unicode
787);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200788#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789
Victor Stinner157f83f2011-09-28 21:41:31 +0200790/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200791 string representation. */
792
Martin v. Löwis18e16552006-02-15 17:27:45 +0000793PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 PyObject *unicode /* Unicode object */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +0200795 ) Py_DEPRECATED(3.3);
Guido van Rossumd8225182000-03-10 22:33:05 +0000796
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200797#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798/* Read a character from the string. */
799
800PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
801 PyObject *unicode,
802 Py_ssize_t index
803 );
804
805/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200806 PyUnicode_New, must not be shared, and must not have been hashed yet.
807
808 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
810PyAPI_FUNC(int) PyUnicode_WriteChar(
811 PyObject *unicode,
812 Py_ssize_t index,
813 Py_UCS4 character
814 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +0200815#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200816
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000817#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000818/* Get the maximum ordinal for a Unicode character. */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +0200819PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000820#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000821
Martin Panter6245cb32016-04-15 02:14:19 +0000822/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100823 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
824 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000825
826 *unicode is modified to point to the new (resized) object and 0
827 returned on success.
828
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100829 Try to resize the string in place (which is usually faster than allocating
830 a new string and copy characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000831
832 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100833 is returned and *unicode left untouched.
834
835 WARNING: The function doesn't check string content, the result may not be a
836 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000837
Mark Hammond91a681d2002-08-12 07:21:58 +0000838PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 PyObject **unicode, /* Pointer to the Unicode object */
840 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000841 );
842
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300843/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000844
Martin Panter20d32552016-04-15 00:56:21 +0000845 bytes, bytearray and other bytes-like objects are decoded according to the
846 given encoding and error handler. The encoding and error handler can be
847 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000848
Martin Panter20d32552016-04-15 00:56:21 +0000849 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000850
851 The API returns NULL in case of an error. The caller is responsible
852 for decref'ing the returned objects.
853
854*/
855
Mark Hammond91a681d2002-08-12 07:21:58 +0000856PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200857 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000858 const char *encoding, /* encoding */
859 const char *errors /* error handling */
860 );
861
Martin Panter20d32552016-04-15 00:56:21 +0000862/* Copy an instance of a Unicode subtype to a new true Unicode object if
863 necessary. If obj is already a true Unicode object (not a subtype), return
864 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000865
866 The API returns NULL in case of an error. The caller is responsible
867 for decref'ing the returned objects.
868
869*/
870
Mark Hammond91a681d2002-08-12 07:21:58 +0000871PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200872 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000873 );
874
Victor Stinner1205f272010-09-11 00:54:47 +0000875PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
876 const char *format, /* ASCII-encoded string */
877 va_list vargs
878 );
879PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
880 const char *format, /* ASCII-encoded string */
881 ...
882 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000883
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000884#ifndef Py_LIMITED_API
Victor Stinnerd3f08822012-05-29 12:57:52 +0200885typedef struct {
886 PyObject *buffer;
887 void *data;
888 enum PyUnicode_Kind kind;
889 Py_UCS4 maxchar;
890 Py_ssize_t size;
891 Py_ssize_t pos;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200892
893 /* minimum number of allocated characters (default: 0) */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200894 Py_ssize_t min_length;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200895
896 /* minimum character (default: 127, ASCII) */
897 Py_UCS4 min_char;
898
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200899 /* If non-zero, overallocate the buffer (default: 0). */
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200900 unsigned char overallocate;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200901
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200902 /* If readonly is 1, buffer is a shared string (cannot be modified)
903 and size is set to 0. */
904 unsigned char readonly;
Victor Stinnerd3f08822012-05-29 12:57:52 +0200905} _PyUnicodeWriter ;
906
907/* Initialize a Unicode writer.
Victor Stinner8f674cc2013-04-17 23:02:17 +0200908 *
909 * By default, the minimum buffer size is 0 character and overallocation is
910 * disabled. Set min_length, min_char and overallocate attributes to control
911 * the allocation of the buffer. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200912PyAPI_FUNC(void)
Victor Stinner8f674cc2013-04-17 23:02:17 +0200913_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200914
/* Prepare the buffer to write LENGTH characters with the specified maximum
   character MAXCHAR.

   Nothing is called when the writer already accepts MAXCHAR and has at
   least LENGTH characters of room left, or when LENGTH is 0; otherwise the
   work is delegated to _PyUnicodeWriter_PrepareInternal().

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)                  \
    (((MAXCHAR) <= (WRITER)->maxchar                                       \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                       \
         ? 0                                                               \
         : (((LENGTH) == 0)                                                \
                ? 0                                                        \
                : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
926
927/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
928 instead. */
929PyAPI_FUNC(int)
930_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
931 Py_ssize_t length, Py_UCS4 maxchar);
932
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   KIND must not be PyUnicode_WCHAR_KIND (checked with assert()).  Nothing
   is called when the writer's kind is already at least KIND.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     ((KIND) <= (WRITER)->kind                                        \
          ? 0                                                         \
          : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))))
943
944/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
945 macro instead. */
946PyAPI_FUNC(int)
947_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
948 enum PyUnicode_Kind kind);
949
Victor Stinnera0dd0212013-04-11 22:09:04 +0200950/* Append a Unicode character.
951 Return 0 on success, raise an exception and return -1 on error. */
952PyAPI_FUNC(int)
953_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
954 Py_UCS4 ch
955 );
956
Victor Stinnere215d962012-10-06 23:03:36 +0200957/* Append a Unicode string.
958 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200959PyAPI_FUNC(int)
Victor Stinnere215d962012-10-06 23:03:36 +0200960_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
961 PyObject *str /* Unicode string */
962 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200963
Victor Stinnercfc4c132013-04-03 01:48:39 +0200964/* Append a substring of a Unicode string.
965 Return 0 on success, raise an exception and return -1 on error. */
966PyAPI_FUNC(int)
967_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
968 PyObject *str, /* Unicode string */
969 Py_ssize_t start,
970 Py_ssize_t end
971 );
972
Serhiy Storchakad65c9492015-11-02 14:10:23 +0200973/* Append an ASCII-encoded byte string.
Victor Stinner4a587072013-11-19 12:54:53 +0100974 Return 0 on success, raise an exception and return -1 on error. */
975PyAPI_FUNC(int)
976_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
977 const char *str, /* ASCII-encoded byte string */
978 Py_ssize_t len /* number of bytes, or -1 if unknown */
979 );
980
Victor Stinnere215d962012-10-06 23:03:36 +0200981/* Append a latin1-encoded byte string.
982 Return 0 on success, raise an exception and return -1 on error. */
983PyAPI_FUNC(int)
Victor Stinner4a587072013-11-19 12:54:53 +0100984_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
985 const char *str, /* latin1-encoded byte string */
986 Py_ssize_t len /* length in bytes */
Victor Stinnere215d962012-10-06 23:03:36 +0200987 );
988
Martin Panter6245cb32016-04-15 02:14:19 +0000989/* Get the value of the writer as a Unicode string. Clear the
Victor Stinnere215d962012-10-06 23:03:36 +0200990 buffer of the writer. Raise an exception and return NULL
991 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200992PyAPI_FUNC(PyObject *)
993_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
994
Victor Stinnere215d962012-10-06 23:03:36 +0200995/* Deallocate memory of a writer (clear its internal buffer). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200996PyAPI_FUNC(void)
997_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
998#endif
999
1000#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +00001001/* Format the object based on the format_spec, as defined in PEP 3101
1002 (Advanced String Formatting). */
Victor Stinnerd3f08822012-05-29 12:57:52 +02001003PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
1004 _PyUnicodeWriter *writer,
1005 PyObject *obj,
1006 PyObject *format_spec,
1007 Py_ssize_t start,
1008 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001009#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +00001010
Walter Dörwald16807132007-05-25 13:52:07 +00001011PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
1012PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001013PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1014 const char *u /* UTF-8 encoded string */
1015 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001016#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +00001017PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001018#endif
Walter Dörwald16807132007-05-25 13:52:07 +00001019
/* Read the `state.interned` field of op.  No type check is performed: use
   only if you know op is a string. */
#define PyUnicode_CHECK_INTERNED(op) (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +00001023
Guido van Rossumd8225182000-03-10 22:33:05 +00001024/* --- wchar_t support for platforms which support it --------------------- */
1025
#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given size.
   The buffer is copied into the new object. */

PyAPI_FUNC(PyObject *) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size);           /* size of buffer */

/* Copy the Unicode Object contents into the wchar_t buffer w.  At most
   size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be 0-terminated.
   It is the responsibility of the caller to make sure that the wchar_t
   string is 0-terminated in case this is required by the application.

   Returns the number of wchar_t characters copied (excluding a possibly
   trailing 0-termination character) or -1 in case of an error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size);           /* size of buffer */

/* Convert the Unicode object to a wide character string.  The output
   string always ends with a nul character.  If size is not NULL, the number
   of wide characters (excluding the null character) is written into *size.

   On success, returns a buffer allocated by PyMem_Malloc() (use
   PyMem_Free() to free it).  On error, returns NULL, *size is undefined
   and a MemoryError is raised. */

PyAPI_FUNC(wchar_t *) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size);          /* number of characters of the result */

#ifndef Py_LIMITED_API
/* NOTE(review): undocumented in this header; see the implementation for
   the contract (including who owns the returned pointer). */
PyAPI_FUNC(void *) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
#endif

#endif
1074
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075/* --- Unicode ordinals --------------------------------------------------- */
1076
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077/* Create a Unicode Object from the given Unicode code point ordinal.
1078
Ezio Melottie7f90372012-10-05 03:33:31 +03001079 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001080 raised in case it is not.
1081
1082*/
1083
Marc-André Lemburg9c329de2002-08-12 08:19:10 +00001084PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001085
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001086/* --- Free-list management ----------------------------------------------- */
1087
1088/* Clear the free list used by the Unicode implementation.
1089
1090 This can be used to release memory used for objects on the free
1091 list back to the Python memory allocator.
1092
1093*/
1094
1095PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1096
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments, encoding and errors.  These
   parameters have the same semantics as the ones of the builtin str() API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors, which may also be set to NULL, meaning
   to use the default handling defined for the codec.  Default error
   handling for all builtin codecs is "strict" (ValueErrors are raised).

   The codecs all use a similar interface.  Only deviations from the
   generic one are documented.

*/
1114
Fred Drakecb093fe2000-05-09 19:51:53 +00001115/* --- Manage the default encoding ---------------------------------------- */
1116
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001117/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001118 Unicode object unicode and the size of the encoded representation
1119 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +00001120
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001121 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001122
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001123 This function caches the UTF-8 encoded string in the unicodeobject
1124 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 when the unicodeobject is deallocated.
1126
1127 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1128 support the previous internal function with the same behaviour.
1129
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001130 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001131 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001132
1133 *** If you need to access the Unicode object as UTF-8 bytes string,
1134 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +00001135*/
1136
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001137#ifndef Py_LIMITED_API
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02001138PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001139 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001140 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001142#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001143
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001144/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001145 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1148 in the unicodeobject.
1149
1150 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1151 support the previous internal function with the same behaviour.
1152
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001153 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001154 extracted from the returned data.
1155
1156 *** This API is for interpreter INTERNAL USE ONLY and will likely
1157 *** be removed or changed for Python 3.1.
1158
1159 *** If you need to access the Unicode object as UTF-8 bytes string,
1160 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001161
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001162*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001163
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001164#ifndef Py_LIMITED_API
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02001165PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001167#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001168
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001169/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001170
Mark Hammond91a681d2002-08-12 07:21:58 +00001171PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001172
Guido van Rossumd8225182000-03-10 22:33:05 +00001173/* --- Generic Codecs ----------------------------------------------------- */
1174
1175/* Create a Unicode object by decoding the encoded string s of the
1176 given size. */
1177
Mark Hammond91a681d2002-08-12 07:21:58 +00001178PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001179 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001180 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001181 const char *encoding, /* encoding */
1182 const char *errors /* error handling */
1183 );
1184
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001185/* Decode a Unicode object unicode and return the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +03001186 object.
1187
1188 This API is DEPRECATED. The only supported standard encoding is rot13.
1189 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
1190 that decode from str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001191
1192PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 PyObject *unicode, /* Unicode object */
1194 const char *encoding, /* encoding */
1195 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001196 ) Py_DEPRECATED(3.6);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001197
1198/* Decode a Unicode object unicode and return the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +03001199 object.
1200
1201 This API is DEPRECATED. The only supported standard encoding is rot13.
1202 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
1203 that decode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001204
1205PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 PyObject *unicode, /* Unicode object */
1207 const char *encoding, /* encoding */
1208 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001209 ) Py_DEPRECATED(3.6);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001210
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001211/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001212 Python string object. */
1213
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001214#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001215PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001216 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001218 const char *encoding, /* encoding */
1219 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001220 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001221#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001222
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001223/* Encodes a Unicode object and returns the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +03001224 object.
1225
Ville Skyttä49b27342017-08-03 09:00:59 +03001226 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
Serhiy Storchaka00939072016-10-27 21:05:49 +03001227 since all standard encodings (except rot13) encode str to bytes.
1228 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
1229 that encode from str to non-bytes. */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001230
1231PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001232 PyObject *unicode, /* Unicode object */
1233 const char *encoding, /* encoding */
1234 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001235 ) Py_DEPRECATED(3.6);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001236
Guido van Rossumd8225182000-03-10 22:33:05 +00001237/* Encodes a Unicode object and returns the result as Python string
1238 object. */
1239
Mark Hammond91a681d2002-08-12 07:21:58 +00001240PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001241 PyObject *unicode, /* Unicode object */
1242 const char *encoding, /* encoding */
1243 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001244 );
1245
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001246/* Encodes a Unicode object and returns the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +03001247 object.
1248
1249 This API is DEPRECATED. The only supported standard encoding is rot13.
1250 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
1251 that encode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001252
1253PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 PyObject *unicode, /* Unicode object */
1255 const char *encoding, /* encoding */
1256 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001257 ) Py_DEPRECATED(3.6);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001258
1259/* Build an encoding map. */
1260
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001261PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1262 PyObject* string /* 256 character map */
1263 );
1264
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001265/* --- UTF-7 Codecs ------------------------------------------------------- */
1266
Mark Hammond91a681d2002-08-12 07:21:58 +00001267PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 const char *string, /* UTF-7 encoded string */
1269 Py_ssize_t length, /* size of string */
1270 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001271 );
1272
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001273PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 const char *string, /* UTF-7 encoded string */
1275 Py_ssize_t length, /* size of string */
1276 const char *errors, /* error handling */
1277 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001278 );
1279
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001280#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001281PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 const Py_UNICODE *data, /* Unicode char buffer */
1283 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1284 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1285 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1286 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001287 ) Py_DEPRECATED(3.3);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001288PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1289 PyObject *unicode, /* Unicode object */
1290 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1291 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1292 const char *errors /* error handling */
1293 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001294#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001295
Guido van Rossumd8225182000-03-10 22:33:05 +00001296/* --- UTF-8 Codecs ------------------------------------------------------- */
1297
Mark Hammond91a681d2002-08-12 07:21:58 +00001298PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 const char *string, /* UTF-8 encoded string */
1300 Py_ssize_t length, /* size of string */
1301 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 );
1303
Walter Dörwald69652032004-09-07 20:24:22 +00001304PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 const char *string, /* UTF-8 encoded string */
1306 Py_ssize_t length, /* size of string */
1307 const char *errors, /* error handling */
1308 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001309 );
1310
Mark Hammond91a681d2002-08-12 07:21:58 +00001311PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001313 );
1314
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001315#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1317 PyObject *unicode,
1318 const char *errors);
1319
Mark Hammond91a681d2002-08-12 07:21:58 +00001320PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 const Py_UNICODE *data, /* Unicode char buffer */
1322 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1323 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001324 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001325#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001326
Walter Dörwald41980ca2007-08-16 21:55:45 +00001327/* --- UTF-32 Codecs ------------------------------------------------------ */
1328
1329/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1330 the corresponding Unicode object.
1331
1332 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001334
1335 If byteorder is non-NULL, the decoder starts decoding using the
1336 given byte order:
1337
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 *byteorder == -1: little endian
1339 *byteorder == 0: native order
1340 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001341
1342 In native mode, the first four bytes of the stream are checked for a
1343 BOM mark. If found, the BOM mark is analysed, the byte order
1344 adjusted and the BOM skipped. In the other modes, no BOM mark
1345 interpretation is done. After completion, *byteorder is set to the
1346 current byte order at the end of input data.
1347
1348 If byteorder is NULL, the codec starts in native order mode.
1349
1350*/
1351
1352PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001353 const char *string, /* UTF-32 encoded string */
1354 Py_ssize_t length, /* size of string */
1355 const char *errors, /* error handling */
1356 int *byteorder /* pointer to byteorder to use
1357 0=native;-1=LE,1=BE; updated on
1358 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001359 );
1360
1361PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362 const char *string, /* UTF-32 encoded string */
1363 Py_ssize_t length, /* size of string */
1364 const char *errors, /* error handling */
1365 int *byteorder, /* pointer to byteorder to use
1366 0=native;-1=LE,1=BE; updated on
1367 exit */
1368 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001369 );
1370
1371/* Returns a Python string using the UTF-32 encoding in native byte
1372 order. The string always starts with a BOM mark. */
1373
1374PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001375 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001376 );
1377
1378/* Returns a Python string object holding the UTF-32 encoded value of
1379 the Unicode data.
1380
1381 If byteorder is not 0, output is written according to the following
1382 byte order:
1383
1384 byteorder == -1: little endian
1385 byteorder == 0: native byte order (writes a BOM mark)
1386 byteorder == 1: big endian
1387
1388 If byteorder is 0, the output string will always start with the
1389 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1390 prepended.
1391
1392*/
1393
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001394#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001395PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 const Py_UNICODE *data, /* Unicode char buffer */
1397 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1398 const char *errors, /* error handling */
1399 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001400 ) Py_DEPRECATED(3.3);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001401PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1402 PyObject *object, /* Unicode object */
1403 const char *errors, /* error handling */
1404 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1405 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001406#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001407
Guido van Rossumd8225182000-03-10 22:33:05 +00001408/* --- UTF-16 Codecs ------------------------------------------------------ */
1409
Guido van Rossum9e896b32000-04-05 20:11:21 +00001410/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001411 the corresponding Unicode object.
1412
1413 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001415
1416 If byteorder is non-NULL, the decoder starts decoding using the
1417 given byte order:
1418
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001419 *byteorder == -1: little endian
1420 *byteorder == 0: native order
1421 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001422
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001423 In native mode, the first two bytes of the stream are checked for a
1424 BOM mark. If found, the BOM mark is analysed, the byte order
1425 adjusted and the BOM skipped. In the other modes, no BOM mark
1426 interpretation is done. After completion, *byteorder is set to the
1427 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001428
1429 If byteorder is NULL, the codec starts in native order mode.
1430
1431*/
1432
Mark Hammond91a681d2002-08-12 07:21:58 +00001433PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001434 const char *string, /* UTF-16 encoded string */
1435 Py_ssize_t length, /* size of string */
1436 const char *errors, /* error handling */
1437 int *byteorder /* pointer to byteorder to use
1438 0=native;-1=LE,1=BE; updated on
1439 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001440 );
1441
Walter Dörwald69652032004-09-07 20:24:22 +00001442PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 const char *string, /* UTF-16 encoded string */
1444 Py_ssize_t length, /* size of string */
1445 const char *errors, /* error handling */
1446 int *byteorder, /* pointer to byteorder to use
1447 0=native;-1=LE,1=BE; updated on
1448 exit */
1449 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001450 );
1451
Guido van Rossumd8225182000-03-10 22:33:05 +00001452/* Returns a Python string using the UTF-16 encoding in native byte
1453 order. The string always starts with a BOM mark. */
1454
Mark Hammond91a681d2002-08-12 07:21:58 +00001455PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001457 );
1458
1459/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001460 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001461
1462 If byteorder is not 0, output is written according to the following
1463 byte order:
1464
1465 byteorder == -1: little endian
1466 byteorder == 0: native byte order (writes a BOM mark)
1467 byteorder == 1: big endian
1468
1469 If byteorder is 0, the output string will always start with the
1470 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1471 prepended.
1472
1473 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1474 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001475 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001476
1477*/
1478
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001479#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001480PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481 const Py_UNICODE *data, /* Unicode char buffer */
1482 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1483 const char *errors, /* error handling */
1484 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001485 ) Py_DEPRECATED(3.3);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001486PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1487 PyObject* unicode, /* Unicode object */
1488 const char *errors, /* error handling */
1489 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1490 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001491#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001492
1493/* --- Unicode-Escape Codecs ---------------------------------------------- */
1494
Mark Hammond91a681d2002-08-12 07:21:58 +00001495PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 const char *string, /* Unicode-Escape encoded string */
1497 Py_ssize_t length, /* size of string */
1498 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001499 );
1500
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001501#ifndef Py_LIMITED_API
Eric V. Smith56466482016-10-31 14:46:26 -04001502/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
1503 chars. */
1504PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
1505 const char *string, /* Unicode-Escape encoded string */
1506 Py_ssize_t length, /* size of string */
1507 const char *errors, /* error handling */
1508 const char **first_invalid_escape /* on return, points to first
1509 invalid escaped char in
1510 string. */
1511);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001512#endif
Eric V. Smith56466482016-10-31 14:46:26 -04001513
Mark Hammond91a681d2002-08-12 07:21:58 +00001514PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001515 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001516 );
1517
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001518#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001519PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 const Py_UNICODE *data, /* Unicode char buffer */
1521 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001522 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001523#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001524
1525/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1526
Mark Hammond91a681d2002-08-12 07:21:58 +00001527PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001528 const char *string, /* Raw-Unicode-Escape encoded string */
1529 Py_ssize_t length, /* size of string */
1530 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001531 );
1532
Mark Hammond91a681d2002-08-12 07:21:58 +00001533PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001535 );
1536
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001537#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001538PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 const Py_UNICODE *data, /* Unicode char buffer */
1540 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001541 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001542#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001543
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001544/* --- Unicode Internal Codec ---------------------------------------------
1545
1546 Only for internal use in _codecsmodule.c */
1547
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001548#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001549PyObject *_PyUnicode_DecodeUnicodeInternal(
1550 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001551 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001552 const char *errors
1553 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001554#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001555
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001556/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001557
1558 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1559
1560*/
1561
Mark Hammond91a681d2002-08-12 07:21:58 +00001562PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 const char *string, /* Latin-1 encoded string */
1564 Py_ssize_t length, /* size of string */
1565 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001566 );
1567
Mark Hammond91a681d2002-08-12 07:21:58 +00001568PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001569 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001570 );
1571
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001572#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1574 PyObject* unicode,
1575 const char* errors);
1576
Mark Hammond91a681d2002-08-12 07:21:58 +00001577PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001578 const Py_UNICODE *data, /* Unicode char buffer */
1579 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1580 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001581 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001582#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001583
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001585
1586 Only 7-bit ASCII data is accepted. All other codes generate errors.
1587
1588*/
1589
Mark Hammond91a681d2002-08-12 07:21:58 +00001590PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 const char *string, /* ASCII encoded string */
1592 Py_ssize_t length, /* size of string */
1593 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001594 );
1595
Mark Hammond91a681d2002-08-12 07:21:58 +00001596PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001597 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001598 );
1599
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001600#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1602 PyObject* unicode,
1603 const char* errors);
1604
Mark Hammond91a681d2002-08-12 07:21:58 +00001605PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001606 const Py_UNICODE *data, /* Unicode char buffer */
1607 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1608 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001609 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001610#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001611
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001613
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001614 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001615
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001616 Decoding mappings must map byte ordinals (integers in the range from 0 to
1617 255) to Unicode strings, integers (which are then interpreted as Unicode
1618 ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
1619 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
1620 mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +00001621
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001622 Encoding mappings must map Unicode ordinal integers to bytes objects,
1623 integers in the range from 0 to 255 or None. Unmapped character
1624 ordinals (ones which cause a LookupError) as well as mapped to
1625 None are treated as "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +00001626
1627*/
1628
Mark Hammond91a681d2002-08-12 07:21:58 +00001629PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001630 const char *string, /* Encoded string */
1631 Py_ssize_t length, /* size of string */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001632 PyObject *mapping, /* decoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001634 );
1635
Mark Hammond91a681d2002-08-12 07:21:58 +00001636PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001638 PyObject *mapping /* encoding mapping */
Guido van Rossumd8225182000-03-10 22:33:05 +00001639 );
1640
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001641#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001642PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 const Py_UNICODE *data, /* Unicode char buffer */
1644 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001645 PyObject *mapping, /* encoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001647 ) Py_DEPRECATED(3.3);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001648PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1649 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001650 PyObject *mapping, /* encoding mapping */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001651 const char *errors /* error handling */
1652 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001653#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001654
1655/* Translate a Py_UNICODE buffer of the given length by applying a
1656 character mapping table to it and return the resulting Unicode
1657 object.
1658
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001659 The mapping table must map Unicode ordinal integers to Unicode strings,
1660 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001661
1662 Mapping tables may be dictionaries or sequences. Unmapped character
1663 ordinals (ones which cause a LookupError) are left untouched and
1664 are copied as-is.
1665
1666*/
1667
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001668#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001669PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 const Py_UNICODE *data, /* Unicode char buffer */
1671 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1672 PyObject *table, /* Translate table */
1673 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001674 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001675#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001676
Steve Dowercc16be82016-09-08 10:35:16 -07001677#ifdef MS_WINDOWS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001678
Guido van Rossumefec1152000-03-28 02:01:15 +00001679/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001680
Mark Hammond91a681d2002-08-12 07:21:58 +00001681PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001682 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -07001683 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001684 const char *errors /* error handling */
1685 );
1686
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001687PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1688 const char *string, /* MBCS encoded string */
1689 Py_ssize_t length, /* size of string */
1690 const char *errors, /* error handling */
1691 Py_ssize_t *consumed /* bytes consumed */
1692 );
1693
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001694#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +02001695PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1696 int code_page, /* code page number */
1697 const char *string, /* encoded string */
1698 Py_ssize_t length, /* size of string */
1699 const char *errors, /* error handling */
1700 Py_ssize_t *consumed /* bytes consumed */
1701 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001702#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02001703
Mark Hammond91a681d2002-08-12 07:21:58 +00001704PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001705 PyObject *unicode /* Unicode object */
1706 );
1707
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001708#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001709PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001710 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001711 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001712 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001713 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001714#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001715
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001716#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +02001717PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1718 int code_page, /* code page number */
1719 PyObject *unicode, /* Unicode object */
1720 const char *errors /* error handling */
1721 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001722#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02001723
Steve Dowercc16be82016-09-08 10:35:16 -07001724#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001725
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02001726#ifndef Py_LIMITED_API
Guido van Rossum9e896b32000-04-05 20:11:21 +00001727/* --- Decimal Encoder ---------------------------------------------------- */
1728
1729/* Takes a Unicode string holding a decimal value and writes it into
1730 an output buffer using standard ASCII digit codes.
1731
1732 The output buffer has to provide at least length+1 bytes of storage
1733 area. The output string is 0-terminated.
1734
1735 The encoder converts whitespace to ' ', decimal characters to their
1736 corresponding ASCII digit and all other Latin-1 characters except
1737 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1738 are treated as errors. This includes embedded null characters.
1739
1740 Error handling is defined by the errors argument:
1741
1742 NULL or "strict": raise a ValueError
1743 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001745 "replace": replaces illegal characters with '?'
1746
1747 Returns 0 on success, -1 on failure.
1748
1749*/
1750
Mark Hammond91a681d2002-08-12 07:21:58 +00001751PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 Py_UNICODE *s, /* Unicode buffer */
1753 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1754 char *output, /* Output buffer; must have size >= length + 1
                     (the output string is 0-terminated) */
1755 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001756 ) /* Py_DEPRECATED(3.3) */;
Guido van Rossum9e896b32000-04-05 20:11:21 +00001757
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001758/* Transforms code points that have decimal digit property to the
1759 corresponding ASCII digit code points.
1760
1761 Returns a new Unicode string on success, NULL on failure.
1762*/
1763
1764PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1765 Py_UNICODE *s, /* Unicode buffer */
1766 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001767 ) /* Py_DEPRECATED(3.3) */;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001768
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02001769/* Converts a Unicode object holding a decimal value to an ASCII string
1770 for use in int, float and complex parsers.
1771 Transforms code points that have decimal digit property to the
1772 corresponding ASCII digit code points. Transforms spaces to ASCII.
1773 Transforms code points starting from the first non-ASCII code point that
1774 is neither a decimal digit nor a space to the end into '?'. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1777 PyObject *unicode /* Unicode object */
1778 );
1779#endif
1780
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001781/* --- Locale encoding --------------------------------------------------- */
1782
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001783#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001784/* Decode a string from the current locale encoding. The decoder is strict if
1785 *errors* is NULL or "strict"; if *errors* is "surrogateescape", it uses the
1786 'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
1787 byte sequence can be decoded as a surrogate character and the
1788 'surrogateescape' error handler is in use, the byte sequence is escaped
1789 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001790 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001791
1792PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1793 const char *str,
1794 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01001795 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001796
1797/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1798 length using strlen(). */
1799
1800PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1801 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001802 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001803
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001804/* Encode a Unicode object to the current locale encoding. The encoder is
1805 strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape", the
1806 "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +01001807 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001808
1809PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1810 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +01001811 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001812 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001813#endif
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001814
Martin v. Löwis011e8422009-05-05 04:43:17 +00001815/* --- File system encoding ---------------------------------------------- */
1816
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001817/* ParseTuple converter: encode str objects to bytes using
1818 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001819
1820PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1821
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001822/* ParseTuple converter: decode bytes objects to unicode using
1823 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1824
1825PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1826
Victor Stinner77c38622010-05-14 15:58:55 +00001827/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1828 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001829
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001830 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1831 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001832
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001833 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001834*/
1835
1836PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1837 const char *s /* encoded string */
1838 );
1839
Victor Stinner77c38622010-05-14 15:58:55 +00001840/* Decode a string using Py_FileSystemDefaultEncoding
1841 and the "surrogateescape" error handler.
1842
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001843 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1844 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001845*/
1846
Martin v. Löwis011e8422009-05-05 04:43:17 +00001847PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1848 const char *s, /* encoded string */
1849 Py_ssize_t size /* size */
1850 );
1851
Victor Stinnerae6265f2010-05-15 16:27:27 +00001852/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001853 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001854
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001855 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1856 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001857*/
1858
1859PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1860 PyObject *unicode
1861 );
1862
Guido van Rossumd8225182000-03-10 22:33:05 +00001863/* --- Methods & Slots ----------------------------------------------------
1864
1865 These are capable of handling Unicode objects and strings on input
1866 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001867 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001868
1869/* Concat two strings giving a new Unicode string. */
1870
Mark Hammond91a681d2002-08-12 07:21:58 +00001871PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001872 PyObject *left, /* Left string */
1873 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001874 );
1875
Walter Dörwald1ab83302007-05-18 17:15:44 +00001876/* Concat two strings and put the result in *pleft
1877 (sets *pleft to NULL on error) */
1878
1879PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001880 PyObject **pleft, /* Pointer to left string */
1881 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001882 );
1883
1884/* Concat two strings, put the result in *pleft and drop the right object
1885 (sets *pleft to NULL on error) */
1886
1887PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001888 PyObject **pleft, /* Pointer to left string */
1889 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001890 );
1891
Guido van Rossumd8225182000-03-10 22:33:05 +00001892/* Split a string giving a list of Unicode strings.
1893
1894 If sep is NULL, splitting will be done at all whitespace
1895 substrings. Otherwise, splits occur at the given separator.
1896
1897 At most maxsplit splits will be done. If negative, no limit is set.
1898
1899 Separators are not included in the resulting list.
1900
1901*/
1902
Mark Hammond91a681d2002-08-12 07:21:58 +00001903PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001904 PyObject *s, /* String to split */
1905 PyObject *sep, /* String separator */
1906 Py_ssize_t maxsplit /* Maxsplit count */
1907 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001908
1909/* Split a string at line breaks, giving a list of Unicode strings.
1910
1911 CRLF is considered to be one line break. Line breaks are not
1912 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001913
Mark Hammond91a681d2002-08-12 07:21:58 +00001914PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001915 PyObject *s, /* String to split */
1916 int keepends /* If true, line end markers are included */
1917 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001918
Thomas Wouters477c8d52006-05-27 19:21:47 +00001919/* Partition a string using a given separator. */
1920
1921PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001922 PyObject *s, /* String to partition */
1923 PyObject *sep /* String separator */
1924 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001925
1926/* Partition a string using a given separator, searching from the end of the
1927 string. */
1928
1929PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001930 PyObject *s, /* String to partition */
1931 PyObject *sep /* String separator */
1932 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001933
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001934/* Split a string giving a list of Unicode strings.
1935
1936 If sep is NULL, splitting will be done at all whitespace
1937 substrings. Otherwise, splits occur at the given separator.
1938
1939 At most maxsplit splits will be done; if maxsplit is negative, no
1940 limit is set. Unlike PyUnicode_Split(), PyUnicode_RSplit() splits
1941 from the end of the string.
1942
1943 Separators are not included in the resulting list.
1944
1945*/
1946
1947PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001948 PyObject *s, /* String to split */
1949 PyObject *sep, /* String separator */
1950 Py_ssize_t maxsplit /* Maxsplit count */
1951 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001952
Guido van Rossumd8225182000-03-10 22:33:05 +00001953/* Translate a string by applying a character mapping table to it and
1954 return the resulting Unicode object.
1955
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001956 The mapping table must map Unicode ordinal integers to Unicode strings,
1957 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001958
1959 Mapping tables may be dictionaries or sequences. Unmapped character
1960 ordinals (ones which cause a LookupError) are left untouched and
1961 are copied as-is.
1962
1963*/
1964
Mark Hammond91a681d2002-08-12 07:21:58 +00001965PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001966 PyObject *str, /* String */
1967 PyObject *table, /* Translate table */
1968 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001969 );
1970
1971/* Join a sequence of strings using the given separator and return
1972 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001973
Mark Hammond91a681d2002-08-12 07:21:58 +00001974PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001975 PyObject *separator, /* Separator string */
1976 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001977 );
1978
Serhiy Storchakaea525a22016-09-06 22:07:53 +03001979#ifndef Py_LIMITED_API
1980PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1981 PyObject *separator,
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02001982 PyObject *const *items,
Serhiy Storchakaea525a22016-09-06 22:07:53 +03001983 Py_ssize_t seqlen
1984 );
1985#endif /* Py_LIMITED_API */
1986
Guido van Rossumd8225182000-03-10 22:33:05 +00001987/* Return 1 if substr matches str[start:end] at the given tail end, 0
1988 otherwise. */
1989
Martin v. Löwis18e16552006-02-15 17:27:45 +00001990PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001991 PyObject *str, /* String */
1992 PyObject *substr, /* Prefix or Suffix string */
1993 Py_ssize_t start, /* Start index */
1994 Py_ssize_t end, /* Stop index */
1995 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001996 );
1997
1998/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001999 given search direction or -1 if not found. -2 is returned in case
2000 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002001
Martin v. Löwis18e16552006-02-15 17:27:45 +00002002PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002003 PyObject *str, /* String */
2004 PyObject *substr, /* Substring to find */
2005 Py_ssize_t start, /* Start index */
2006 Py_ssize_t end, /* Stop index */
2007 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00002008 );
2009
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02002010#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011/* Like PyUnicode_Find, but search for single character only. */
2012PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
2013 PyObject *str,
2014 Py_UCS4 ch,
2015 Py_ssize_t start,
2016 Py_ssize_t end,
2017 int direction
2018 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02002019#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020
Barry Warsaw51ac5802000-03-20 16:36:48 +00002021/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002022
Martin v. Löwis18e16552006-02-15 17:27:45 +00002023PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002024 PyObject *str, /* String */
2025 PyObject *substr, /* Substring to count */
2026 Py_ssize_t start, /* Start index */
2027 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00002028 );
2029
Barry Warsaw51ac5802000-03-20 16:36:48 +00002030/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00002031 and return the resulting Unicode object. */
2032
Mark Hammond91a681d2002-08-12 07:21:58 +00002033PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002034 PyObject *str, /* String */
2035 PyObject *substr, /* Substring to find */
2036 PyObject *replstr, /* Substring to replace */
2037 Py_ssize_t maxcount /* Max. number of replacements to apply;
2038 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00002039 );
2040
2041/* Compare two strings and return -1, 0, 1 for less than, equal, and
Victor Stinner90db9c42012-10-04 21:53:50 +02002042 greater than, respectively.
2043 Raise an exception and return -1 on error. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002044
Mark Hammond91a681d2002-08-12 07:21:58 +00002045PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002046 PyObject *left, /* Left string */
2047 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00002048 );
2049
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002050#ifndef Py_LIMITED_API
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002051/* Test whether a Unicode object is equal to an ASCII identifier. Return 1 if
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +02002052 true, 0 otherwise. The right argument must be an ASCII identifier.
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002053 Any error that occurs inside will be cleared before returning. */
2054
2055PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
2056 PyObject *left, /* Left string */
2057 _Py_Identifier *right /* Right identifier */
2058 );
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002059#endif
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002060
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002061/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
2062 equal, and greater than, respectively. It is best to pass only
2063 ASCII-encoded strings, but the function interprets the input string as
2064 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +02002065 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002066
Martin v. Löwis5b222132007-06-10 09:51:05 +00002067PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2068 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00002069 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00002070 );
2071
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +02002072#ifndef Py_LIMITED_API
2073/* Test whether a Unicode object is equal to an ASCII string. Return 1 if
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +02002074 true, 0 otherwise. The right argument must be an ASCII-encoded string.
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +02002075 Any error that occurs inside will be cleared before returning. */
2076
2077PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
2078 PyObject *left,
2079 const char *right /* ASCII-encoded string */
2080 );
2081#endif
2082
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002083/* Rich compare two strings and return one of the following:
2084
2085 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +00002086 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002087 - Py_NotImplemented in case the type combination is unknown
2088
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002089 Possible values for op:
2090
2091 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2092
2093*/
2094
2095PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002096 PyObject *left, /* Left string */
2097 PyObject *right, /* Right string */
2098 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002099 );
2100
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002101/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00002102 the resulting Unicode string. */
2103
Mark Hammond91a681d2002-08-12 07:21:58 +00002104PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002105 PyObject *format, /* Format string */
2106 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00002107 );
2108
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002109/* Checks whether element is contained in container and return 1/0
2110 accordingly.
2111
Martin Pantercc71a792016-04-05 06:19:42 +00002112 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002113 returned in case of an error. */
2114
Mark Hammond91a681d2002-08-12 07:21:58 +00002115PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002116 PyObject *container, /* Container string */
2117 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002118 );
2119
Martin v. Löwis47383402007-08-15 07:32:56 +00002120/* Checks whether argument is a valid identifier. */
2121
2122PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2123
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002124#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002125/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00002126PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002127 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002128 int striptype,
2129 PyObject *sepobj
2130 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002131#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002132
Eric Smitha3b1ac82009-04-03 14:45:06 +00002133/* Using explicit passed-in values, insert the thousands grouping
2134 into the string pointed to by buffer. For the argument descriptions,
2135 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002136#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02002138 PyObject *unicode,
Victor Stinner41a863c2012-02-24 00:37:51 +01002139 Py_ssize_t index,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 Py_ssize_t n_buffer,
2141 void *digits,
2142 Py_ssize_t n_digits,
2143 Py_ssize_t min_width,
2144 const char *grouping,
Victor Stinner41a863c2012-02-24 00:37:51 +01002145 PyObject *thousands_sep,
2146 Py_UCS4 *maxchar);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002147#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002148/* === Characters Type APIs =============================================== */
2149
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002150/* Helper array used by Py_UNICODE_ISSPACE(). */
2151
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002152#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002153PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2154
Guido van Rossumd8225182000-03-10 22:33:05 +00002155/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002156 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00002157
2158 These APIs are implemented in Objects/unicodectype.c.
2159
2160*/
2161
Mark Hammond91a681d2002-08-12 07:21:58 +00002162PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002163 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002164 );
2165
Mark Hammond91a681d2002-08-12 07:21:58 +00002166PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002167 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002168 );
2169
Mark Hammond91a681d2002-08-12 07:21:58 +00002170PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002171 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002172 );
2173
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002174PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002175 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002176 );
2177
2178PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002179 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002180 );
2181
Mark Hammond91a681d2002-08-12 07:21:58 +00002182PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002183 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002184 );
2185
Mark Hammond91a681d2002-08-12 07:21:58 +00002186PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002187 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002188 );
2189
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002190PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2191 Py_UCS4 ch /* Unicode character */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002192 ) /* Py_DEPRECATED(3.3) */;
Guido van Rossumd8225182000-03-10 22:33:05 +00002193
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002194PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2195 Py_UCS4 ch /* Unicode character */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002196 ) /* Py_DEPRECATED(3.3) */;
Guido van Rossumd8225182000-03-10 22:33:05 +00002197
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002198PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2199 Py_UCS4 ch /* Unicode character */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002200 ) Py_DEPRECATED(3.3);
Guido van Rossumd8225182000-03-10 22:33:05 +00002201
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002202PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2203 Py_UCS4 ch, /* Unicode character */
2204 Py_UCS4 *res
2205 );
2206
2207PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2208 Py_UCS4 ch, /* Unicode character */
2209 Py_UCS4 *res
2210 );
2211
2212PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2213 Py_UCS4 ch, /* Unicode character */
2214 Py_UCS4 *res
2215 );
2216
Benjamin Petersond5890c82012-01-14 13:23:30 -05002217PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2218 Py_UCS4 ch, /* Unicode character */
2219 Py_UCS4 *res
2220 );
2221
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002222PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002223 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002224 );
2225
2226PyAPI_FUNC(int) _PyUnicode_IsCased(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002227 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002228 );
2229
Mark Hammond91a681d2002-08-12 07:21:58 +00002230PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002231 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002232 );
2233
Mark Hammond91a681d2002-08-12 07:21:58 +00002234PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002235 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002236 );
2237
Mark Hammond91a681d2002-08-12 07:21:58 +00002238PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002239 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002240 );
2241
Mark Hammond91a681d2002-08-12 07:21:58 +00002242PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002243 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002244 );
2245
Mark Hammond91a681d2002-08-12 07:21:58 +00002246PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002247 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002248 );
2249
Mark Hammond91a681d2002-08-12 07:21:58 +00002250PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002251 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002252 );
2253
Georg Brandl559e5d72008-06-11 18:37:52 +00002254PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002255 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002256 );
2257
Mark Hammond91a681d2002-08-12 07:21:58 +00002258PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002259 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002260 );
2261
Victor Stinneref8d95c2010-08-16 22:03:11 +00002262PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2263 const Py_UNICODE *u
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002264 ) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002265
2266PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002267 Py_UNICODE *s1,
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002268 const Py_UNICODE *s2) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002269
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002270PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002271 Py_UNICODE *s1, const Py_UNICODE *s2) Py_DEPRECATED(3.3);
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002272
Martin v. Löwis5b222132007-06-10 09:51:05 +00002273PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002274 Py_UNICODE *s1,
2275 const Py_UNICODE *s2,
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002276 size_t n) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002277
2278PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002279 const Py_UNICODE *s1,
2280 const Py_UNICODE *s2
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002281 ) Py_DEPRECATED(3.3);
Victor Stinneref8d95c2010-08-16 22:03:11 +00002282
2283PyAPI_FUNC(int) Py_UNICODE_strncmp(
2284 const Py_UNICODE *s1,
2285 const Py_UNICODE *s2,
2286 size_t n
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002287 ) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002288
2289PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002290 const Py_UNICODE *s,
2291 Py_UNICODE c
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002292 ) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002293
Victor Stinner331ea922010-08-10 16:37:20 +00002294PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002295 const Py_UNICODE *s,
2296 Py_UNICODE c
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002297 ) Py_DEPRECATED(3.3);
Victor Stinner331ea922010-08-10 16:37:20 +00002298
Ethan Furmanb95b5612015-01-23 20:05:18 -08002299PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2300
Victor Stinner71133ff2010-09-01 23:43:53 +00002301/* Create a copy of a unicode string ending with a nul character. Return NULL
2302 and raise a MemoryError exception on memory allocation failure, otherwise
2303 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2304
Victor Stinner46408602010-09-03 16:18:00 +00002305PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002306 PyObject *unicode
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002307 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002308#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002309
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002310#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002311PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002312 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002313 int check_content);
T. Woutersa00c3fd2017-03-31 09:14:41 -07002314#elif !defined(NDEBUG)
2315/* For asserts that call _PyUnicode_CheckConsistency(), which would
2316 * otherwise be a problem when building with asserts but without Py_DEBUG. */
2317#define _PyUnicode_CheckConsistency(op, check_content) PyUnicode_Check(op)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002318#endif
2319
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03002320#ifndef Py_LIMITED_API
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002321/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2322PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2323/* Clear all static strings. */
2324PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2325
Raymond Hettingerac2ef652015-07-04 16:04:44 -07002326/* Fast equality check when the inputs are known to be exact unicode types
2327 and where the hash values are equal (i.e. a very probable match) */
2328PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03002329#endif /* !Py_LIMITED_API */
Raymond Hettingerac2ef652015-07-04 16:04:44 -07002330
Guido van Rossumd8225182000-03-10 22:33:05 +00002331#ifdef __cplusplus
2332}
2333#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002334#endif /* !Py_UNICODEOBJECT_H */