blob: bb91568cb4ebf6a3241c3350f780c7597cb0e3ae [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
/* Python 3.x requires unicode */
#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
/* Stop compilation with a diagnostic when SIZEOF_WCHAR_T has not been
   defined before this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
/* Size of one Py_UNICODE code unit, expressed through SIZEOF_WCHAR_T. */
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T

/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
   for UTF-16) */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */

#ifndef Py_LIMITED_API
/* Both names below resolve to wchar_t and are hidden from the limited API. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE /* Py_DEPRECATED(3.3) */;
#endif
95
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* HAVE_USABLE_WCHAR_T implies HAVE_WCHAR_H: define the latter if only the
   former was set. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
#  include <wchar.h>
#endif
113
/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations: unsigned code units of exactly 32, 16 and
   8 bits. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119
Guido van Rossumd8225182000-03-10 22:33:05 +0000120/* --- Internal Unicode Operations ---------------------------------------- */
121
/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000128#ifndef Py_LIMITED_API
/* Whitespace test: values below 128 are looked up in the
   _Py_ascii_whitespace table, everything else is passed to
   _PyUnicode_IsWhitespace().  Note that `ch` is evaluated twice, so it
   must not have side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000131
/* Each macro below forwards its argument unchanged to the correspondingly
   named _PyUnicode_* helper. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
149
/* Alphabetic test; forwards to _PyUnicode_IsAlpha(). */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True when any of the alpha, decimal, digit or numeric tests succeeds.
   `ch` may be evaluated up to four times, so it must not have side
   effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000157
/* Copy `length` Py_UNICODE code units from `source` to `target` with
   memcpy(); as with memcpy(), the two regions must not overlap. */
#define Py_UNICODE_COPY(target, source, length) \
    memcpy((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000160
/* Store `value` into the first `length` code units of `target`.
   Every argument is evaluated exactly once (target, then value, then
   length); the length is captured in a local so the loop condition does
   not re-evaluate the caller's expression on each iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value); \
        const Py_ssize_t n_ = (length); \
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_; \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000165
/* macros to work with surrogates */
/* The three range tests evaluate `ch` twice; avoid side effects in it. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; no check is made that
   the arguments really are surrogates. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
    (((((Py_UCS4)(high) & 0x03FF) << 10) | \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300178
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty. */

/* The first and last code units are compared before falling back to
   memcmp() over the whole substring.  Both objects are accessed through
   their wstr / wstr_length fields, and the arguments are evaluated
   several times. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
186
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000187#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000188
Barry Warsaw51ac5802000-03-20 16:36:48 +0000189#ifdef __cplusplus
190extern "C" {
191#endif
192
Guido van Rossumd8225182000-03-10 22:33:05 +0000193/* --- Unicode Type ------------------------------------------------------- */
194
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000195#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200196
197/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
198 structure. state.ascii and state.compact are set, and the data
199 immediately follow the structure. utf8_length and wstr_length can be found
200 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000201typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200202 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200203
204 - compact ascii:
205
206 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100207 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200208 * kind = PyUnicode_1BYTE_KIND
209 * compact = 1
210 * ascii = 1
211 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200212 * (length is the length of the utf8 and wstr strings)
213 * (data starts just after the structure)
214 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200215
216 - compact:
217
218 * structure = PyCompactUnicodeObject
Victor Stinner80bc72d2011-12-22 03:23:10 +0100219 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200220 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
221 PyUnicode_4BYTE_KIND
222 * compact = 1
223 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200224 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200225 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200226 * utf8_length = 0 if utf8 is NULL
227 * wstr is shared with data and wstr_length=length
228 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100229 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200230 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200231 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200232
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200233 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200234
235 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100236 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100237 * length = 0 (use wstr_length)
238 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200239 * kind = PyUnicode_WCHAR_KIND
240 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200242 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100243 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * wstr is not NULL
245 * data.any is NULL
246 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200247 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200248
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200249 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200250
251 * structure = PyUnicodeObject structure
Victor Stinner7a9105a2011-12-12 00:13:42 +0100252 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200253 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
254 PyUnicode_4BYTE_KIND
255 * compact = 0
256 * ready = 1
257 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200258 * utf8 is shared and utf8_length = length with data.any if ascii = 1
259 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100260 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200261 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
262 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
263 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 Compact strings use only one memory block (structure + characters),
266 whereas legacy strings use one block for the structure and one block
267 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200268
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200269 Legacy strings are created by PyUnicode_FromUnicode() and
270 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
271 when PyUnicode_READY() is called.
272
273 See also _PyUnicode_CheckConsistency().
274 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000275 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200276 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000277 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200278 struct {
279 /*
280 SSTATE_NOT_INTERNED (0)
281 SSTATE_INTERNED_MORTAL (1)
282 SSTATE_INTERNED_IMMORTAL (2)
283
284 If interned != SSTATE_NOT_INTERNED, the two references from the
285 dictionary to this object are *not* counted in ob_refcnt.
286 */
287 unsigned int interned:2;
288 /* Character size:
289
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200290 - PyUnicode_WCHAR_KIND (0):
291
292 * character type = wchar_t (16 or 32 bits, depending on the
293 platform)
294
295 - PyUnicode_1BYTE_KIND (1):
296
297 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100298 * all characters are in the range U+0000-U+00FF (latin1)
299 * if ascii is set, all characters are in the range U+0000-U+007F
300 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200301 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302
303 - PyUnicode_2BYTE_KIND (2):
304
305 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100306 * all characters are in the range U+0000-U+FFFF (BMP)
307 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200308
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200309 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200310
311 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100312 * all characters are in the range U+0000-U+10FFFF
313 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200315 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Compact is with respect to the allocation scheme. Compact unicode
317 objects only require one memory block while non-compact objects use
318 one block for the PyUnicodeObject struct and another for its data
319 buffer. */
320 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100321 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200322 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
323 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 unsigned int ascii:1;
325 /* The ready flag indicates whether the object layout is initialized
326 completely. This means that this is either a compact object, or
327 the data pointer is filled out. The bit is redundant, and helps
328 to minimize the test in PyUnicode_IS_READY(). */
329 unsigned int ready:1;
Antoine Pitrou8c6f8dc2014-03-23 22:55:03 +0100330 /* Padding to ensure that PyUnicode_DATA() is always aligned to
331 4 bytes (see issue #19537 on m68k). */
332 unsigned int :24;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 } state;
334 wchar_t *wstr; /* wchar_t representation (null-terminated) */
335} PyASCIIObject;
336
337/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200338 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200339 immediately follow the structure. */
340typedef struct {
341 PyASCIIObject _base;
342 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
343 * terminating \0. */
344 char *utf8; /* UTF-8 representation (null-terminated) */
345 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
346 * surrogates count as two code points. */
347} PyCompactUnicodeObject;
348
349/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
350 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200351 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200352typedef struct {
353 PyCompactUnicodeObject _base;
354 union {
355 void *any;
356 Py_UCS1 *latin1;
357 Py_UCS2 *ucs2;
358 Py_UCS4 *ucs4;
359 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000360} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000361#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000362
Mark Hammond91a681d2002-08-12 07:21:58 +0000363PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000364PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000365
/* PyUnicode_Check tests the Py_TPFLAGS_UNICODE_SUBCLASS flag of op's type;
   PyUnicode_CheckExact compares op's type with PyUnicode_Type itself. */
#define PyUnicode_Check(op) \
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000369
370/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000371#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372
/* Length of the wstr representation: compact ASCII objects have no
   wstr_length field and use `length`; every other form reads
   wstr_length.  `op` is parenthesised inside the casts so that compound
   argument expressions are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
377
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points. */

/* When wstr is NULL, PyUnicode_AsUnicode() is called for its side effect
   of filling it in and its result is discarded; the following assert
   checks that wstr is then set. */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? \
      PyUnicode_WSTR_LENGTH(op) : \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
       assert(((PyASCIIObject *)(op))->wstr), \
       PyUnicode_WSTR_LENGTH(op)))
    /* Py_DEPRECATED(3.3) */

/* Size in bytes of the Py_UNICODE representation. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
    /* Py_DEPRECATED(3.3) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200395
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

/* Returns the existing wstr pointer when it is set, otherwise the result
   of PyUnicode_AsUnicode(). */
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))
    /* Py_DEPRECATED(3.3) */

/* Same pointer as PyUnicode_AS_UNICODE(), viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
    /* Py_DEPRECATED(3.3) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410
411
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200412/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413
/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
420
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  `op` is parenthesised inside the cast so that compound argument
   expressions are cast as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject*)(op))->state.ascii)
Victor Stinnera3b334d2011-10-03 13:53:37 +0200428
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed: the macro only reads
   state.compact through a PyASCIIObject* cast. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)
433
/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is parenthesised inside the cast so that compound argument
   expressions are cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200438
/* Values stored in PyASCIIObject.state.kind. */
enum PyUnicode_Kind {
/* String contains only wstr (wchar_t) characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200449
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly.  */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
458
/* Return one of the PyUnicode_*_KIND values defined above.
   Asserts that op is a unicode object and that it is ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->state.kind)
464
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the buffer starts right after the header structure,
   which is PyASCIIObject for ASCII strings and PyCompactUnicodeObject
   otherwise. */
#define _PyUnicode_COMPACT_DATA(op) \
    (PyUnicode_IS_ASCII(op) ? \
     ((void*)((PyASCIIObject*)(op) + 1)) : \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the buffer is data.any, asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op) \
    (assert(((PyUnicodeObject*)(op))->data.any), \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on the compact flag to one of the two helpers above. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
     _PyUnicode_NONCOMPACT_DATA(op))
479
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   The value is converted to the element type of the selected kind, so a
   value that does not fit that type is truncated. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
506
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  Any kind other than PyUnicode_1BYTE_KIND
   or PyUnicode_2BYTE_KIND is read as Py_UCS4.  Only the selected branch is
   evaluated, so data and index are evaluated once. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
518
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   `unicode` appears several times in the expansion, so it must not have
   side effects. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     (Py_UCS4) \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->length)
542
543
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   `op` is parenthesised inside the cast so that compound argument
   expressions (for example a conditional expression yielding void *)
   are cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
548
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \
    (assert(PyUnicode_Check(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.
   The result is 0x7f for ASCII strings, otherwise the largest value of
   the string's kind: 0xff, 0xffff or 0x10ffff.  All four constants are
   unsigned, which is the type the conditional expression already had. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_ASCII(op) ? \
      (0x7fU) : \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
       (0xffU) : \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
        (0xffffU) : \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200570
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000571#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000572
573/* --- Constants ---------------------------------------------------------- */
574
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0.  The constant is typed as Py_UCS4. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000581
582/* === Public API ========================================================= */
583
584/* --- Plain Py_UNICODE --------------------------------------------------- */
585
/* With PEP 393, this is the recommended way to allocate a new unicode object.
   This function will allocate the object and its buffer in a single memory
   block.  Objects created using this function are not resizable.

   size is the number of code points in the new string; maxchar is the
   maximum code point value in the string.
   Not declared when Py_LIMITED_API is defined. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_New(
    Py_ssize_t size,            /* Number of code points in the new string */
    Py_UCS4 maxchar             /* maximum code point value in the string */
    );
#endif
595
/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation.  This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   The PyUnicode_READY() macro passes this function's return value through
   and documents it as 0 on success and -1 on errors.

   Don't call this function directly, use the public PyUnicode_READY() macro
   instead. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) _PyUnicode_Ready(
    PyObject *unicode           /* Unicode object */
    );
#endif
608
/* Get a copy of a Unicode string.
   Not declared when Py_LIMITED_API is defined.
   NOTE(review): error behaviour (presumably NULL with an exception set) is
   not stated here -- confirm against the implementation. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
    PyObject *unicode
    );
#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200615
/* Copy characters from one unicode object into another.  This function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error.

   Pseudo-code:

      how_many = min(how_many, len(from) - from_start)
      to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
      return how_many

   Note: The function doesn't write a terminating null character.
   */
#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );

/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
   may crash if parameters are invalid (e.g. if the output string
   is too short).  Unlike the checked variant it returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
#endif
654
#ifndef Py_LIMITED_API
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );

/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
   if parameters are invalid (e.g. if length is longer than the string).
   Unlike the checked variant it returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastFill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
#endif
680
/* Create a Unicode Object from the Py_UNICODE buffer u of the given
   size.

   u may be NULL which causes the contents to be undefined. It is the
   user's responsibility to fill in the needed data afterwards. Note
   that modifying the Unicode object contents after construction is
   only allowed if u was set to NULL.

   The buffer is copied into the new object.

   Not declared when Py_LIMITED_API is defined.  The Py_DEPRECATED(3.3)
   marker on this prototype is currently commented out. */

#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
    const Py_UNICODE *u,        /* Unicode buffer */
    Py_ssize_t size             /* size of buffer */
    ) /* Py_DEPRECATED(3.3) */;
#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000697
/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes.
   size is the size of the buffer. */
PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
    const char *u,             /* UTF-8 encoded string */
    Py_ssize_t size            /* size of buffer */
    );

/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
   UTF-8 encoded bytes.  The size is determined with strlen(). */
PyAPI_FUNC(PyObject*) PyUnicode_FromString(
    const char *u              /* UTF-8 encoded string */
    );
709
#ifndef Py_LIMITED_API
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
   Scan the string to find the maximum character.
   NOTE(review): kind presumably selects which of the three element types
   buffer holds, and size is presumably counted in characters -- confirm. */
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
    int kind,
    const void *buffer,
    Py_ssize_t size);

/* Create a new string from a buffer of ASCII characters.
   WARNING: Don't check if the string contains any non-ASCII character;
   the caller is responsible for passing pure ASCII data. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
    const char *buffer,
    Py_ssize_t size);
#endif
724
/* Declared for the full API, and for the limited API from version 3.3
   (0x03030000) onwards.
   NOTE(review): no description accompanies this declaration; presumably it
   returns a string holding str[start:end] -- confirm against the
   implementation. */
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
PyAPI_FUNC(PyObject*) PyUnicode_Substring(
    PyObject *str,
    Py_ssize_t start,
    Py_ssize_t end);
#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200731
#ifndef Py_LIMITED_API
/* Compute the maximum character of the substring unicode[start:end].
   Return 127 for an empty string.  The result is a Py_UCS4 value. */
PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t end);
#endif
740
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Copy the string into a UCS4 buffer including the null character if copy_null
   is set.  Return NULL and raise an exception on error.  Raise a SystemError if
   the buffer is smaller than the string.  Return buffer on success.

   buflen is the length of the buffer in (Py_UCS4) characters. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
    PyObject *unicode,
    Py_UCS4* buffer,
    Py_ssize_t buflen,
    int copy_null);

/* Copy the string into a UCS4 buffer.  A new buffer is allocated using
   PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758
#ifndef Py_LIMITED_API
/* Return a read-only pointer to the Unicode object's internal
   Py_UNICODE buffer.
   If the wchar_t/Py_UNICODE representation is not yet available, this
   function will calculate it.

   Although the buffer is documented as read-only, the return type is not
   const-qualified.  The Py_DEPRECATED(3.3) marker is currently commented
   out. */

PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
    PyObject *unicode           /* Unicode object */
    ) /* Py_DEPRECATED(3.3) */;

/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
   contains null characters.  Returns a const-qualified pointer. */
PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
    PyObject *unicode           /* Unicode object */
    );

/* Return a read-only pointer to the Unicode object's internal
   Py_UNICODE buffer and save the length at size.
   If the wchar_t/Py_UNICODE representation is not yet available, this
   function will calculate it.

   The Py_DEPRECATED(3.3) marker is currently commented out. */

PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* location where to save the length */
    ) /* Py_DEPRECATED(3.3) */;
#endif
785
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Get the length of the Unicode object.
   Declared for the full API, and for the limited API from version 3.3
   (0x03030000) onwards. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
    PyObject *unicode
);
#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793
/* Get the number of Py_UNICODE units in the
   string representation.
   This function is marked Py_DEPRECATED(3.3). */

PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
    PyObject *unicode           /* Unicode object */
    ) Py_DEPRECATED(3.3);
Guido van Rossumd8225182000-03-10 22:33:05 +0000800
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Read a character from the string.
   NOTE(review): the error convention (e.g. for an out-of-range index) is not
   stated here -- confirm against the implementation. */

PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
    PyObject *unicode,
    Py_ssize_t index
    );

/* Write a character to the string.  The string must have been created through
   PyUnicode_New, must not be shared, and must not have been hashed yet.

   Return 0 on success, -1 on error. */

PyAPI_FUNC(int) PyUnicode_WriteChar(
    PyObject *unicode,
    Py_ssize_t index,
    Py_UCS4 character
    );
#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820
#ifndef Py_LIMITED_API
/* Get the maximum ordinal for a Unicode character.
   The value is returned as a Py_UNICODE; the function is marked
   Py_DEPRECATED(3.3). */
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void) Py_DEPRECATED(3.3);
#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000825
/* Resize a Unicode object.  The length is the number of characters, except
   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
   is the number of Py_UNICODE characters.

   *unicode is modified to point to the new (resized) object and 0
   returned on success.

   Try to resize the string in place (which is usually faster than allocating
   a new string and copying characters), or create a new string.

   Error handling is implemented as follows: an exception is set, -1
   is returned and *unicode left untouched.

   WARNING: The function doesn't check string content, the result may not be a
            string in canonical representation. */

PyAPI_FUNC(int) PyUnicode_Resize(
    PyObject **unicode,         /* Pointer to the Unicode object */
    Py_ssize_t length           /* New length */
    );
846
/* Decode obj to a Unicode object.

   bytes, bytearray and other bytes-like objects are decoded according to the
   given encoding and error handler.  The encoding and error handler can be
   NULL to have the interface use UTF-8 and "strict".

   All other objects (including Unicode objects) raise an exception.

   The API returns NULL in case of an error.  The caller is responsible
   for decref'ing the returned objects.

*/

PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
    PyObject *obj,              /* Object */
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
    );
865
/* Copy an instance of a Unicode subtype to a new true Unicode object if
   necessary.  If obj is already a true Unicode object (not a subtype), return
   the reference with *incremented* refcount.

   In both cases the caller therefore receives a reference it has to release.

   The API returns NULL in case of an error.  The caller is responsible
   for decref'ing the returned objects.

*/

PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
    PyObject *obj      /* Object */
    );
878
/* Build a string from an ASCII-encoded format string; the arguments are
   supplied as a va_list (FromFormatV) or as variadic arguments (FromFormat).
   NOTE(review): the supported format codes and the error convention are not
   described here -- see the implementation / C API documentation. */
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
    const char *format,   /* ASCII-encoded string  */
    va_list vargs
    );
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
    const char *format,   /* ASCII-encoded string  */
    ...
    );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000888#ifndef Py_LIMITED_API
/* State of a Unicode writer (see the _PyUnicodeWriter_*() functions). */
typedef struct {
    /* NOTE(review): buffer and data look like the string object being built
       and a pointer to its character storage -- confirm in the
       implementation. */
    PyObject *buffer;
    void *data;
    /* _PyUnicodeWriter_PrepareKind() does nothing when the requested kind
       is <= this value */
    enum PyUnicode_Kind kind;
    /* _PyUnicodeWriter_Prepare() does nothing when the requested maximum
       character is <= maxchar and the requested length is <= size - pos */
    Py_UCS4 maxchar;
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter ;
910
/* Initialize a Unicode writer.
 *
 * By default, the minimum buffer size is 0 character and overallocation is
 * disabled.  Set min_length, min_char and overallocate attributes to control
 * the allocation of the buffer.  The function returns nothing. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200918
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   The common case is decided inline: the macro yields 0 without a function
   call when MAXCHAR does not exceed the writer's maxchar and LENGTH fits in
   size - pos, or when LENGTH is 0.  Otherwise it calls
   _PyUnicodeWriter_PrepareInternal().  WRITER, LENGTH and MAXCHAR each
   appear several times in the expansion, so they should be free of side
   effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
930
/* Slow path of _PyUnicodeWriter_Prepare(); the macro's return value in that
   case is this function's return value.
   Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
   instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar);
936
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   KIND must not be PyUnicode_WCHAR_KIND (checked with assert()).  When KIND
   is <= the writer's current kind the macro yields 0 without a function
   call; otherwise it calls _PyUnicodeWriter_PrepareKindInternal().

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     (KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
947
/* Slow path of _PyUnicodeWriter_PrepareKind(); the macro's return value in
   that case is this function's return value.
   Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
   macro instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     enum PyUnicode_Kind kind);
953
/* Append a Unicode character.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
    Py_UCS4 ch
    );

/* Append a Unicode string.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
    PyObject *str               /* Unicode string */
    );

/* Append a substring of a Unicode string.
   NOTE(review): start/end presumably delimit str[start:end] -- confirm.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
    PyObject *str,              /* Unicode string */
    Py_ssize_t start,
    Py_ssize_t end
    );

/* Append an ASCII-encoded byte string.
   len may be -1 if the number of bytes is unknown.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
    const char *str,           /* ASCII-encoded byte string */
    Py_ssize_t len             /* number of bytes, or -1 if unknown */
    );

/* Append a latin1-encoded byte string.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
    const char *str,           /* latin1-encoded byte string */
    Py_ssize_t len             /* length in bytes */
    );
992
/* Get the value of the writer as a Unicode string.  Clear the
   buffer of the writer.  Raise an exception and return NULL
   on error. */
PyAPI_FUNC(PyObject *)
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);

/* Deallocate memory of a writer (clear its internal buffer).
   The function returns nothing. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
1002#endif
1003
#ifndef Py_LIMITED_API
/* Format the object based on the format_spec, as defined in PEP 3101
   (Advanced String Formatting).
   NOTE(review): the result is presumably appended to writer, with start/end
   selecting a slice of format_spec, and the int return presumably follows
   the 0 / -1 convention of the other writer functions -- confirm against
   the implementation. */
PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
    _PyUnicodeWriter *writer,
    PyObject *obj,
    PyObject *format_spec,
    Py_ssize_t start,
    Py_ssize_t end);
#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +00001014
/* NOTE(review): no description accompanies these declarations.  Judging by
   the names and the PyObject ** parameter they intern the referenced string
   in place (InternImmortal presumably also making it immortal) -- confirm
   against the implementation. */
PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
/* Takes a UTF-8 encoded C string.
   NOTE(review): presumably returns the interned string built from it. */
PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
    const char *u              /* UTF-8 encoded string */
    );
#ifndef Py_LIMITED_API
/* Not declared when Py_LIMITED_API is defined. */
PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
#endif
Walter Dörwald16807132007-05-25 13:52:07 +00001023
/* Use only if you know it's a string: op is cast to PyASCIIObject * without
   any type check.  Evaluates to the "interned" field of the object's
   state. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +00001027
Guido van Rossumd8225182000-03-10 22:33:05 +00001028/* --- wchar_t support for platforms which support it --------------------- */
1029
1030#ifdef HAVE_WCHAR_H
1031
/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object.
   Only declared when HAVE_WCHAR_H is defined. */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );
1041
/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );
1059
/* Convert the Unicode object to a wide character string.  The output string
   always ends with a null character.  If size is not NULL, write the number
   of wide characters (excluding the null character) into *size.

   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
   on success.  On error, returns NULL and raises a MemoryError; *size is
   undefined in that case. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* number of characters of the result */
    );
1072
#ifndef Py_LIMITED_API
/* NOTE(review): undocumented here.  This declaration sits inside the
   HAVE_WCHAR_H section although its signature does not involve wchar_t --
   confirm whether that placement is intended.  Presumably returns the
   characters of s converted to the given kind; ownership of the returned
   pointer is not stated. */
PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076
Guido van Rossumd8225182000-03-10 22:33:05 +00001077#endif
1078
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001079/* --- Unicode ordinals --------------------------------------------------- */
1080
/* Create a Unicode Object from the given Unicode code point ordinal.

   The ordinal must be in range(0x110000), i.e. 0 <= ordinal < 0x110000.
   A ValueError is raised in case it is not.

*/

PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001089
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001090/* --- Free-list management ----------------------------------------------- */
1091
/* Clear the free list used by the Unicode implementation.

   This can be used to release memory used for objects on the free
   list back to the Python memory allocator.

   NOTE(review): the meaning of the int return value is not stated here --
   confirm against the implementation.

*/

PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1100
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +00001102
1103 Many of these APIs take two arguments encoding and errors. These
1104 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001105 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +00001106
Georg Brandl952867a2010-06-27 10:17:12 +00001107 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +00001108
1109 Error handling is set by errors which may also be set to NULL
1110 meaning to use the default handling defined for the codec. Default
1111 error handling for all builtin codecs is "strict" (ValueErrors are
1112 raised).
1113
   The codecs all use a similar interface.  Only deviations from the
   generic ones are documented.
1116
1117*/
1118
Fred Drakecb093fe2000-05-09 19:51:53 +00001119/* --- Manage the default encoding ---------------------------------------- */
1120
/* Returns a pointer to the default encoding (UTF-8) of the
   Unicode object unicode and the size of the encoded representation
   in bytes stored in *size.

   In case of an error, *size is not set.

   This function caches the UTF-8 encoded string in the unicodeobject
   and subsequent calls will return the same string.  The memory is released
   when the unicodeobject is deallocated.

   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
   support the previous internal function with the same behaviour.

   *** This API is for interpreter INTERNAL USE ONLY and will likely
   *** be removed or changed in the future.

   *** If you need to access the Unicode object as UTF-8 bytes string,
   *** please use PyUnicode_AsUTF8String() instead.
*/

#ifndef Py_LIMITED_API
PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
    PyObject *unicode,
    Py_ssize_t *size);
#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001147
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001148/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001149 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1152 in the unicodeobject.
1153
1154 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1155 support the previous internal function with the same behaviour.
1156
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001157 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001158 extracted from the returned data.
1159
1160 *** This API is for interpreter INTERNAL USE ONLY and will likely
1161 *** be removed or changed in the future.
1162
1163 *** If you need to access the Unicode object as UTF-8 bytes string,
1164 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001165
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001166*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001167
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001168#ifndef Py_LIMITED_API
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02001169PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001171#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001172
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001173/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001174
Mark Hammond91a681d2002-08-12 07:21:58 +00001175PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001176
Guido van Rossumd8225182000-03-10 22:33:05 +00001177/* --- Generic Codecs ----------------------------------------------------- */
1178
1179/* Create a Unicode object by decoding the encoded string s of the
1180 given size. */
1181
Mark Hammond91a681d2002-08-12 07:21:58 +00001182PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001183 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001184 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001185 const char *encoding, /* encoding */
1186 const char *errors /* error handling */
1187 );
1188
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189/* Decode a Unicode object unicode and return the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +03001190 object.
1191
1192 This API is DEPRECATED. The only supported standard encoding is rot13.
1193 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
1194 that decode from str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001195
1196PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 PyObject *unicode, /* Unicode object */
1198 const char *encoding, /* encoding */
1199 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001200 ) Py_DEPRECATED(3.6);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001201
1202/* Decode a Unicode object unicode and return the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +03001203 object.
1204
1205 This API is DEPRECATED. The only supported standard encoding is rot13.
1206 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
1207 that decode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001208
1209PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001210 PyObject *unicode, /* Unicode object */
1211 const char *encoding, /* encoding */
1212 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001213 ) Py_DEPRECATED(3.6);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001214
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001216 Python string object. */
1217
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001218#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001219PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001220 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001221 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001222 const char *encoding, /* encoding */
1223 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001224 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001225#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001226
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001227/* Encodes a Unicode object and returns the result as Python
Serhiy Storchaka00939072016-10-27 21:05:49 +03001228 object.
1229
1230 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
1231 since all standard encodings (except rot13) encode str to bytes.
1232 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
1233 that encode from str to non-bytes. */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001234
1235PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001236 PyObject *unicode, /* Unicode object */
1237 const char *encoding, /* encoding */
1238 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001239 ) Py_DEPRECATED(3.6);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001240
Guido van Rossumd8225182000-03-10 22:33:05 +00001241/* Encodes a Unicode object and returns the result as Python string
1242 object. */
1243
Mark Hammond91a681d2002-08-12 07:21:58 +00001244PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 PyObject *unicode, /* Unicode object */
1246 const char *encoding, /* encoding */
1247 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001248 );
1249
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001250/* Encodes a Unicode object and returns the result as Unicode
Serhiy Storchaka00939072016-10-27 21:05:49 +03001251 object.
1252
1253 This API is DEPRECATED. The only supported standard encoding is rot13.
1254 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
1255 that encode from str to str. */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001256
1257PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 PyObject *unicode, /* Unicode object */
1259 const char *encoding, /* encoding */
1260 const char *errors /* error handling */
Serhiy Storchaka00939072016-10-27 21:05:49 +03001261 ) Py_DEPRECATED(3.6);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001262
1263/* Build an encoding map. */
1264
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001265PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1266 PyObject* string /* 256 character map */
1267 );
1268
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001269/* --- UTF-7 Codecs ------------------------------------------------------- */
1270
Mark Hammond91a681d2002-08-12 07:21:58 +00001271PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 const char *string, /* UTF-7 encoded string */
1273 Py_ssize_t length, /* size of string */
1274 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001275 );
1276
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001277PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 const char *string, /* UTF-7 encoded string */
1279 Py_ssize_t length, /* size of string */
1280 const char *errors, /* error handling */
1281 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001282 );
1283
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001284#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001285PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 const Py_UNICODE *data, /* Unicode char buffer */
1287 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1288 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1289 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1290 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001291 ) Py_DEPRECATED(3.3);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001292PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1293 PyObject *unicode, /* Unicode object */
1294 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1295 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1296 const char *errors /* error handling */
1297 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001298#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001299
Guido van Rossumd8225182000-03-10 22:33:05 +00001300/* --- UTF-8 Codecs ------------------------------------------------------- */
1301
Mark Hammond91a681d2002-08-12 07:21:58 +00001302PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 const char *string, /* UTF-8 encoded string */
1304 Py_ssize_t length, /* size of string */
1305 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001306 );
1307
Walter Dörwald69652032004-09-07 20:24:22 +00001308PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 const char *string, /* UTF-8 encoded string */
1310 Py_ssize_t length, /* size of string */
1311 const char *errors, /* error handling */
1312 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001313 );
1314
Mark Hammond91a681d2002-08-12 07:21:58 +00001315PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001317 );
1318
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001319#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1321 PyObject *unicode,
1322 const char *errors);
1323
Mark Hammond91a681d2002-08-12 07:21:58 +00001324PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 const Py_UNICODE *data, /* Unicode char buffer */
1326 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1327 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001328 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001329#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001330
Walter Dörwald41980ca2007-08-16 21:55:45 +00001331/* --- UTF-32 Codecs ------------------------------------------------------ */
1332
1333/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1334 the corresponding Unicode object.
1335
1336 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001338
1339 If byteorder is non-NULL, the decoder starts decoding using the
1340 given byte order:
1341
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 *byteorder == -1: little endian
1343 *byteorder == 0: native order
1344 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001345
1346 In native mode, the first four bytes of the stream are checked for a
1347 BOM mark. If found, the BOM mark is analysed, the byte order
1348 adjusted and the BOM skipped. In the other modes, no BOM mark
1349 interpretation is done. After completion, *byteorder is set to the
1350 current byte order at the end of input data.
1351
1352 If byteorder is NULL, the codec starts in native order mode.
1353
1354*/
1355
1356PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 const char *string, /* UTF-32 encoded string */
1358 Py_ssize_t length, /* size of string */
1359 const char *errors, /* error handling */
1360 int *byteorder /* pointer to byteorder to use
1361 0=native;-1=LE,1=BE; updated on
1362 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001363 );
1364
1365PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 const char *string, /* UTF-32 encoded string */
1367 Py_ssize_t length, /* size of string */
1368 const char *errors, /* error handling */
1369 int *byteorder, /* pointer to byteorder to use
1370 0=native;-1=LE,1=BE; updated on
1371 exit */
1372 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001373 );
1374
1375/* Returns a Python string using the UTF-32 encoding in native byte
1376 order. The string always starts with a BOM mark. */
1377
1378PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001380 );
1381
1382/* Returns a Python string object holding the UTF-32 encoded value of
1383 the Unicode data.
1384
1385 If byteorder is not 0, output is written according to the following
1386 byte order:
1387
1388 byteorder == -1: little endian
1389 byteorder == 0: native byte order (writes a BOM mark)
1390 byteorder == 1: big endian
1391
1392 If byteorder is 0, the output string will always start with the
1393 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1394 prepended.
1395
1396*/
1397
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001398#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001399PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 const Py_UNICODE *data, /* Unicode char buffer */
1401 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1402 const char *errors, /* error handling */
1403 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001404 ) Py_DEPRECATED(3.3);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001405PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1406 PyObject *object, /* Unicode object */
1407 const char *errors, /* error handling */
1408 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1409 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001410#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001411
Guido van Rossumd8225182000-03-10 22:33:05 +00001412/* --- UTF-16 Codecs ------------------------------------------------------ */
1413
Guido van Rossum9e896b32000-04-05 20:11:21 +00001414/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001415 the corresponding Unicode object.
1416
1417 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001419
1420 If byteorder is non-NULL, the decoder starts decoding using the
1421 given byte order:
1422
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 *byteorder == -1: little endian
1424 *byteorder == 0: native order
1425 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001426
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001427 In native mode, the first two bytes of the stream are checked for a
1428 BOM mark. If found, the BOM mark is analysed, the byte order
1429 adjusted and the BOM skipped. In the other modes, no BOM mark
1430 interpretation is done. After completion, *byteorder is set to the
1431 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001432
1433 If byteorder is NULL, the codec starts in native order mode.
1434
1435*/
1436
Mark Hammond91a681d2002-08-12 07:21:58 +00001437PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 const char *string, /* UTF-16 encoded string */
1439 Py_ssize_t length, /* size of string */
1440 const char *errors, /* error handling */
1441 int *byteorder /* pointer to byteorder to use
1442 0=native;-1=LE,1=BE; updated on
1443 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001444 );
1445
Walter Dörwald69652032004-09-07 20:24:22 +00001446PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 const char *string, /* UTF-16 encoded string */
1448 Py_ssize_t length, /* size of string */
1449 const char *errors, /* error handling */
1450 int *byteorder, /* pointer to byteorder to use
1451 0=native;-1=LE,1=BE; updated on
1452 exit */
1453 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001454 );
1455
Guido van Rossumd8225182000-03-10 22:33:05 +00001456/* Returns a Python string using the UTF-16 encoding in native byte
1457 order. The string always starts with a BOM mark. */
1458
Mark Hammond91a681d2002-08-12 07:21:58 +00001459PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001461 );
1462
1463/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001464 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001465
1466 If byteorder is not 0, output is written according to the following
1467 byte order:
1468
1469 byteorder == -1: little endian
1470 byteorder == 0: native byte order (writes a BOM mark)
1471 byteorder == 1: big endian
1472
1473 If byteorder is 0, the output string will always start with the
1474 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1475 prepended.
1476
1477 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1478 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001479 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001480
1481*/
1482
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001483#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001484PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 const Py_UNICODE *data, /* Unicode char buffer */
1486 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1487 const char *errors, /* error handling */
1488 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001489 ) Py_DEPRECATED(3.3);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001490PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1491 PyObject* unicode, /* Unicode object */
1492 const char *errors, /* error handling */
1493 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1494 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001495#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001496
1497/* --- Unicode-Escape Codecs ---------------------------------------------- */
1498
Mark Hammond91a681d2002-08-12 07:21:58 +00001499PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001500 const char *string, /* Unicode-Escape encoded string */
1501 Py_ssize_t length, /* size of string */
1502 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001503 );
1504
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001505#ifndef Py_LIMITED_API
Eric V. Smith56466482016-10-31 14:46:26 -04001506/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
1507 chars. */
1508PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
1509 const char *string, /* Unicode-Escape encoded string */
1510 Py_ssize_t length, /* size of string */
1511 const char *errors, /* error handling */
1512 const char **first_invalid_escape /* on return, points to first
1513 invalid escaped char in
1514 string. */
1515);
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001516#endif
Eric V. Smith56466482016-10-31 14:46:26 -04001517
Mark Hammond91a681d2002-08-12 07:21:58 +00001518PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001519 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001520 );
1521
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001522#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001523PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001524 const Py_UNICODE *data, /* Unicode char buffer */
1525 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001526 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001527#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001528
1529/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1530
Mark Hammond91a681d2002-08-12 07:21:58 +00001531PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 const char *string, /* Raw-Unicode-Escape encoded string */
1533 Py_ssize_t length, /* size of string */
1534 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001535 );
1536
Mark Hammond91a681d2002-08-12 07:21:58 +00001537PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001538 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001539 );
1540
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001541#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001542PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 const Py_UNICODE *data, /* Unicode char buffer */
1544 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001545 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001546#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001547
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001548/* --- Unicode Internal Codec ---------------------------------------------
1549
1550 Only for internal use in _codecsmodule.c */
1551
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001552#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001553PyObject *_PyUnicode_DecodeUnicodeInternal(
1554 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001555 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001556 const char *errors
1557 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001558#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001559
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001560/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001561
1562 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1563
1564*/
1565
Mark Hammond91a681d2002-08-12 07:21:58 +00001566PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 const char *string, /* Latin-1 encoded string */
1568 Py_ssize_t length, /* size of string */
1569 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001570 );
1571
Mark Hammond91a681d2002-08-12 07:21:58 +00001572PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001573 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001574 );
1575
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001576#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1578 PyObject* unicode,
1579 const char* errors);
1580
Mark Hammond91a681d2002-08-12 07:21:58 +00001581PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001582 const Py_UNICODE *data, /* Unicode char buffer */
1583 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1584 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001585 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001586#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001587
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001588/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001589
1590 Only 7-bit ASCII data is accepted. All other codes generate errors.
1591
1592*/
1593
Mark Hammond91a681d2002-08-12 07:21:58 +00001594PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 const char *string, /* ASCII encoded string */
1596 Py_ssize_t length, /* size of string */
1597 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001598 );
1599
Mark Hammond91a681d2002-08-12 07:21:58 +00001600PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001602 );
1603
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001604#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1606 PyObject* unicode,
1607 const char* errors);
1608
Mark Hammond91a681d2002-08-12 07:21:58 +00001609PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 const Py_UNICODE *data, /* Unicode char buffer */
1611 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1612 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001613 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001614#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001615
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001616/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001617
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001618 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001619
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001620 Decoding mappings must map byte ordinals (integers in the range from 0 to
1621 255) to Unicode strings, integers (which are then interpreted as Unicode
1622 ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
1623 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
1624 mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +00001625
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001626 Encoding mappings must map Unicode ordinal integers to bytes objects,
1627 integers in the range from 0 to 255 or None. Unmapped character
1628 ordinals (ones which cause a LookupError) as well as mapped to
1629 None are treated as "undefined mapping" and cause an error.
Guido van Rossumd8225182000-03-10 22:33:05 +00001630
1631*/
1632
Mark Hammond91a681d2002-08-12 07:21:58 +00001633PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634 const char *string, /* Encoded string */
1635 Py_ssize_t length, /* size of string */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001636 PyObject *mapping, /* decoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001638 );
1639
Mark Hammond91a681d2002-08-12 07:21:58 +00001640PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001641 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001642 PyObject *mapping /* encoding mapping */
Guido van Rossumd8225182000-03-10 22:33:05 +00001643 );
1644
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001645#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001646PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 const Py_UNICODE *data, /* Unicode char buffer */
1648 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001649 PyObject *mapping, /* encoding mapping */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001650 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001651 ) Py_DEPRECATED(3.3);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001652PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1653 PyObject *unicode, /* Unicode object */
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001654 PyObject *mapping, /* encoding mapping */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001655 const char *errors /* error handling */
1656 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001657#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001658
1659/* Translate a Py_UNICODE buffer of the given length by applying a
1660 character mapping table to it and return the resulting Unicode
1661 object.
1662
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001663 The mapping table must map Unicode ordinal integers to Unicode strings,
1664 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001665
1666 Mapping tables may be dictionaries or sequences. Unmapped character
1667 ordinals (ones which cause a LookupError) are left untouched and
1668 are copied as-is.
1669
1670*/
1671
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001672#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001673PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 const Py_UNICODE *data, /* Unicode char buffer */
1675 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1676 PyObject *table, /* Translate table */
1677 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001678 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001679#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001680
Steve Dowercc16be82016-09-08 10:35:16 -07001681#ifdef MS_WINDOWS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001682
Guido van Rossumefec1152000-03-28 02:01:15 +00001683/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001684
Mark Hammond91a681d2002-08-12 07:21:58 +00001685PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001686 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -07001687 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001688 const char *errors /* error handling */
1689 );
1690
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001691PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1692 const char *string, /* MBCS encoded string */
1693 Py_ssize_t length, /* size of string */
1694 const char *errors, /* error handling */
1695 Py_ssize_t *consumed /* bytes consumed */
1696 );
1697
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001698#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +02001699PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1700 int code_page, /* code page number */
1701 const char *string, /* encoded string */
1702 Py_ssize_t length, /* size of string */
1703 const char *errors, /* error handling */
1704 Py_ssize_t *consumed /* bytes consumed */
1705 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001706#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02001707
Mark Hammond91a681d2002-08-12 07:21:58 +00001708PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001709 PyObject *unicode /* Unicode object */
1710 );
1711
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001712#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001713PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001714 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001715 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001716 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001717 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001718#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001719
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001720#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinner3a50e702011-10-18 21:21:00 +02001721PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1722 int code_page, /* code page number */
1723 PyObject *unicode, /* Unicode object */
1724 const char *errors /* error handling */
1725 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001726#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02001727
Steve Dowercc16be82016-09-08 10:35:16 -07001728#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001729
Guido van Rossum9e896b32000-04-05 20:11:21 +00001730/* --- Decimal Encoder ---------------------------------------------------- */
1731
1732/* Takes a Unicode string holding a decimal value and writes it into
1733 an output buffer using standard ASCII digit codes.
1734
1735 The output buffer has to provide at least length+1 bytes of storage
1736 area. The output string is 0-terminated.
1737
1738 The encoder converts whitespace to ' ', decimal characters to their
1739 corresponding ASCII digit and all other Latin-1 characters except
1740 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1741 are treated as errors. This includes embedded NULL bytes.
1742
1743 Error handling is defined by the errors argument:
1744
1745 NULL or "strict": raise a ValueError
1746 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001747 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001748 "replace": replaces illegal characters with '?'
1749
1750 Returns 0 on success, -1 on failure.
1751
1752*/
1753
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001754#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001755PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001756 Py_UNICODE *s, /* Unicode buffer */
1757 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1758 char *output, /* Output buffer; must have size >= length + 1 */
1759 const char *errors /* error handling */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001760 ) /* Py_DEPRECATED(3.3) */;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001761#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001762
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001763/* Transforms code points that have decimal digit property to the
1764 corresponding ASCII digit code points.
1765
1766 Returns a new Unicode string on success, NULL on failure.
1767*/
1768
Georg Brandlb5503082010-12-05 11:40:48 +00001769#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001770PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1771 Py_UNICODE *s, /* Unicode buffer */
1772 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001773 ) /* Py_DEPRECATED(3.3) */;
Georg Brandlb5503082010-12-05 11:40:48 +00001774#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001775
Victor Stinner6f9568b2011-11-17 00:12:44 +01001776/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 as argument instead of a raw buffer and length. This function additionally
1778 transforms spaces to ASCII because this is what the callers in longobject,
1779 floatobject, and complexobject did anyways. */
1780
1781#ifndef Py_LIMITED_API
1782PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1783 PyObject *unicode /* Unicode object */
1784 );
1785#endif
1786
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001787/* --- Locale encoding --------------------------------------------------- */
1788
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001789#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001790/* Decode a string from the current locale encoding. The decoder is strict if
1791 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
1792 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
1793 be decoded as a surrogate character and *surrogateescape* is not equal to
1794 zero, the byte sequence is escaped using the 'surrogateescape' error handler
1795 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001796 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001797
1798PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1799 const char *str,
1800 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01001801 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001802
1803/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1804 length using strlen(). */
1805
1806PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1807 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001808 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001809
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001810/* Encode a Unicode object to the current locale encoding. The encoder is
1811 strict if *surrogateescape* is equal to zero, otherwise the
1812 "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +01001813 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001814
1815PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1816 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +01001817 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001818 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02001819#endif
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001820
Martin v. Löwis011e8422009-05-05 04:43:17 +00001821/* --- File system encoding ---------------------------------------------- */
1822
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001823/* ParseTuple converter: encode str objects to bytes using
1824 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001825
1826PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1827
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001828/* ParseTuple converter: decode bytes objects to unicode using
1829 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1830
1831PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1832
Victor Stinner77c38622010-05-14 15:58:55 +00001833/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1834 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001835
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001836 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1837 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001838
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001839 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001840*/
1841
1842PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1843 const char *s /* encoded string */
1844 );
1845
Victor Stinner77c38622010-05-14 15:58:55 +00001846/* Decode a string using Py_FileSystemDefaultEncoding
1847 and the "surrogateescape" error handler.
1848
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001849 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1850 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001851*/
1852
Martin v. Löwis011e8422009-05-05 04:43:17 +00001853PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1854 const char *s, /* encoded string */
1855 Py_ssize_t size /* size */
1856 );
1857
Victor Stinnerae6265f2010-05-15 16:27:27 +00001858/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001859 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001860
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001861 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1862 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001863*/
1864
1865PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1866 PyObject *unicode
1867 );
1868
Guido van Rossumd8225182000-03-10 22:33:05 +00001869/* --- Methods & Slots ----------------------------------------------------
1870
1871 These are capable of handling Unicode objects and strings on input
1872 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001873 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001874
1875/* Concat two strings giving a new Unicode string. */
1876
Mark Hammond91a681d2002-08-12 07:21:58 +00001877PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001878 PyObject *left, /* Left string */
1879 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001880 );
1881
Walter Dörwald1ab83302007-05-18 17:15:44 +00001882/* Concat two strings and put the result in *pleft
1883 (sets *pleft to NULL on error) */
1884
1885PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001886 PyObject **pleft, /* Pointer to left string */
1887 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001888 );
1889
1890/* Concat two strings, put the result in *pleft and drop the right object
1891 (sets *pleft to NULL on error) */
1892
1893PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001894 PyObject **pleft, /* Pointer to left string */
1895 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001896 );
1897
Guido van Rossumd8225182000-03-10 22:33:05 +00001898/* Split a string giving a list of Unicode strings.
1899
1900 If sep is NULL, splitting will be done at all whitespace
1901 substrings. Otherwise, splits occur at the given separator.
1902
1903 At most maxsplit splits will be done. If negative, no limit is set.
1904
1905 Separators are not included in the resulting list.
1906
1907*/
1908
Mark Hammond91a681d2002-08-12 07:21:58 +00001909PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001910 PyObject *s, /* String to split */
1911 PyObject *sep, /* String separator */
1912 Py_ssize_t maxsplit /* Maxsplit count */
1913 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001914
1915/* Ditto, but split at line breaks.
1916
1917 CRLF is considered to be one line break. Line breaks are not
1918 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001919
Mark Hammond91a681d2002-08-12 07:21:58 +00001920PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001921 PyObject *s, /* String to split */
1922 int keepends /* If true, line end markers are included */
1923 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001924
Thomas Wouters477c8d52006-05-27 19:21:47 +00001925/* Partition a string using a given separator. */
1926
1927PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001928 PyObject *s, /* String to partition */
1929 PyObject *sep /* String separator */
1930 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001931
1932/* Partition a string using a given separator, searching from the end of the
1933 string. */
1934
1935PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001936 PyObject *s, /* String to partition */
1937 PyObject *sep /* String separator */
1938 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001939
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001940/* Split a string giving a list of Unicode strings.
1941
1942 If sep is NULL, splitting will be done at all whitespace
1943 substrings. Otherwise, splits occur at the given separator.
1944
1945 At most maxsplit splits will be done; if maxsplit is negative, no
1946 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1947 the end of the string.
1948
1949 Separators are not included in the resulting list.
1950
1951*/
1952
1953PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001954 PyObject *s, /* String to split */
1955 PyObject *sep, /* String separator */
1956 Py_ssize_t maxsplit /* Maxsplit count */
1957 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001958
Guido van Rossumd8225182000-03-10 22:33:05 +00001959/* Translate a string by applying a character mapping table to it and
1960 return the resulting Unicode object.
1961
Serhiy Storchakac85a2662017-03-19 08:15:17 +02001962 The mapping table must map Unicode ordinal integers to Unicode strings,
1963 Unicode ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001964
1965 Mapping tables may be dictionaries or sequences. Unmapped character
1966 ordinals (ones which cause a LookupError) are left untouched and
1967 are copied as-is.
1968
1969*/
1970
Mark Hammond91a681d2002-08-12 07:21:58 +00001971PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001972 PyObject *str, /* String */
1973 PyObject *table, /* Translate table */
1974 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001975 );
1976
1977/* Join a sequence of strings using the given separator and return
1978 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001979
Mark Hammond91a681d2002-08-12 07:21:58 +00001980PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001981 PyObject *separator, /* Separator string */
1982 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001983 );
1984
Serhiy Storchakaea525a22016-09-06 22:07:53 +03001985#ifndef Py_LIMITED_API
1986PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1987 PyObject *separator,
1988 PyObject **items,
1989 Py_ssize_t seqlen
1990 );
1991#endif /* Py_LIMITED_API */
1992
Guido van Rossumd8225182000-03-10 22:33:05 +00001993/* Return 1 if substr matches str[start:end] at the given tail end, 0
1994 otherwise. Return -1 if an error occurred. */
1995
Martin v. Löwis18e16552006-02-15 17:27:45 +00001996PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001997 PyObject *str, /* String */
1998 PyObject *substr, /* Prefix or Suffix string */
1999 Py_ssize_t start, /* Start index */
2000 Py_ssize_t end, /* Stop index */
2001 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00002002 );
2003
2004/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002005 given search direction or -1 if not found. -2 is returned in case
2006 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002007
Martin v. Löwis18e16552006-02-15 17:27:45 +00002008PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002009 PyObject *str, /* String */
2010 PyObject *substr, /* Substring to find */
2011 Py_ssize_t start, /* Start index */
2012 Py_ssize_t end, /* Stop index */
2013 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00002014 );
2015
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02002016#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017/* Like PyUnicode_Find, but search for single character only. */
2018PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
2019 PyObject *str,
2020 Py_UCS4 ch,
2021 Py_ssize_t start,
2022 Py_ssize_t end,
2023 int direction
2024 );
Serhiy Storchaka34d0ac82016-12-27 14:57:39 +02002025#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026
Barry Warsaw51ac5802000-03-20 16:36:48 +00002027/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002028
Martin v. Löwis18e16552006-02-15 17:27:45 +00002029PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002030 PyObject *str, /* String */
2031 PyObject *substr, /* Substring to count */
2032 Py_ssize_t start, /* Start index */
2033 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00002034 );
2035
Barry Warsaw51ac5802000-03-20 16:36:48 +00002036/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00002037 and return the resulting Unicode object. */
2038
Mark Hammond91a681d2002-08-12 07:21:58 +00002039PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002040 PyObject *str, /* String */
2041 PyObject *substr, /* Substring to find */
2042 PyObject *replstr, /* Substring to replace */
2043 Py_ssize_t maxcount /* Max. number of replacements to apply;
2044 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00002045 );
2046
2047/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +02002048 greater than resp.
2049 Raise an exception and return -1 on error. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002050
Mark Hammond91a681d2002-08-12 07:21:58 +00002051PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002052 PyObject *left, /* Left string */
2053 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00002054 );
2055
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002056#ifndef Py_LIMITED_API
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002057/* Test whether a unicode is equal to ASCII identifier. Return 1 if true,
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +02002058 0 otherwise. The right argument must be ASCII identifier.
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002059 Any error that occurs inside will be cleared before returning. */
2060
2061PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
2062 PyObject *left, /* Left string */
2063 _Py_Identifier *right /* Right identifier */
2064 );
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002065#endif
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002066
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002067/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
2068 equal, and greater than, respectively. It is best to pass only
2069 ASCII-encoded strings, but the function interprets the input string as
2070 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +02002071 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002072
Martin v. Löwis5b222132007-06-10 09:51:05 +00002073PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2074 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00002075 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00002076 );
2077
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +02002078#ifndef Py_LIMITED_API
2079/* Test whether a unicode is equal to ASCII string. Return 1 if true,
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +02002080 0 otherwise. The right argument must be ASCII-encoded string.
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +02002081 Any error that occurs inside will be cleared before returning. */
2082
2083PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
2084 PyObject *left,
2085 const char *right /* ASCII-encoded string */
2086 );
2087#endif
2088
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002089/* Rich compare two strings and return one of the following:
2090
2091 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +00002092 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002093 - Py_NotImplemented in case the type combination is unknown
2094
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002095 Possible values for op:
2096
2097 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2098
2099*/
2100
2101PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002102 PyObject *left, /* Left string */
2103 PyObject *right, /* Right string */
2104 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002105 );
2106
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002107/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00002108 the resulting Unicode string. */
2109
Mark Hammond91a681d2002-08-12 07:21:58 +00002110PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002111 PyObject *format, /* Format string */
2112 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00002113 );
2114
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002115/* Checks whether element is contained in container and return 1/0
2116 accordingly.
2117
Martin Pantercc71a792016-04-05 06:19:42 +00002118 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002119 returned in case of an error. */
2120
Mark Hammond91a681d2002-08-12 07:21:58 +00002121PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002122 PyObject *container, /* Container string */
2123 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002124 );
2125
Martin v. Löwis47383402007-08-15 07:32:56 +00002126/* Checks whether argument is a valid identifier. */
2127
2128PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2129
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002130#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002131/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00002132PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002133 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002134 int striptype,
2135 PyObject *sepobj
2136 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002137#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002138
Eric Smitha3b1ac82009-04-03 14:45:06 +00002139/* Using explicit passed-in values, insert the thousands grouping
2140 into the string pointed to by buffer. For the argument descriptions,
2141 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002142#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02002144 PyObject *unicode,
Victor Stinner41a863c2012-02-24 00:37:51 +01002145 Py_ssize_t index,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 Py_ssize_t n_buffer,
2147 void *digits,
2148 Py_ssize_t n_digits,
2149 Py_ssize_t min_width,
2150 const char *grouping,
Victor Stinner41a863c2012-02-24 00:37:51 +01002151 PyObject *thousands_sep,
2152 Py_UCS4 *maxchar);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002153#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002154/* === Characters Type APIs =============================================== */
2155
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002156/* Helper array used by Py_UNICODE_ISSPACE(). */
2157
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002158#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002159PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2160
Guido van Rossumd8225182000-03-10 22:33:05 +00002161/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002162 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00002163
2164 These APIs are implemented in Objects/unicodectype.c.
2165
2166*/
2167
Mark Hammond91a681d2002-08-12 07:21:58 +00002168PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002169 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002170 );
2171
Mark Hammond91a681d2002-08-12 07:21:58 +00002172PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002173 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002174 );
2175
Mark Hammond91a681d2002-08-12 07:21:58 +00002176PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002177 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002178 );
2179
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002180PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002181 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002182 );
2183
2184PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002185 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002186 );
2187
Mark Hammond91a681d2002-08-12 07:21:58 +00002188PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002189 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002190 );
2191
Mark Hammond91a681d2002-08-12 07:21:58 +00002192PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002193 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002194 );
2195
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002196PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2197 Py_UCS4 ch /* Unicode character */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002198 ) /* Py_DEPRECATED(3.3) */;
Guido van Rossumd8225182000-03-10 22:33:05 +00002199
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002200PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2201 Py_UCS4 ch /* Unicode character */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002202 ) /* Py_DEPRECATED(3.3) */;
Guido van Rossumd8225182000-03-10 22:33:05 +00002203
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002204PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2205 Py_UCS4 ch /* Unicode character */
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002206 ) Py_DEPRECATED(3.3);
Guido van Rossumd8225182000-03-10 22:33:05 +00002207
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002208PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2209 Py_UCS4 ch, /* Unicode character */
2210 Py_UCS4 *res
2211 );
2212
2213PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2214 Py_UCS4 ch, /* Unicode character */
2215 Py_UCS4 *res
2216 );
2217
2218PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2219 Py_UCS4 ch, /* Unicode character */
2220 Py_UCS4 *res
2221 );
2222
Benjamin Petersond5890c82012-01-14 13:23:30 -05002223PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2224 Py_UCS4 ch, /* Unicode character */
2225 Py_UCS4 *res
2226 );
2227
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002228PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002229 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002230 );
2231
2232PyAPI_FUNC(int) _PyUnicode_IsCased(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002233 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002234 );
2235
Mark Hammond91a681d2002-08-12 07:21:58 +00002236PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002237 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002238 );
2239
Mark Hammond91a681d2002-08-12 07:21:58 +00002240PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002241 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002242 );
2243
Mark Hammond91a681d2002-08-12 07:21:58 +00002244PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002245 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002246 );
2247
Mark Hammond91a681d2002-08-12 07:21:58 +00002248PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002249 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002250 );
2251
Mark Hammond91a681d2002-08-12 07:21:58 +00002252PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002253 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002254 );
2255
Mark Hammond91a681d2002-08-12 07:21:58 +00002256PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002257 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002258 );
2259
Georg Brandl559e5d72008-06-11 18:37:52 +00002260PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002261 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002262 );
2263
Mark Hammond91a681d2002-08-12 07:21:58 +00002264PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002265 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002266 );
2267
Victor Stinneref8d95c2010-08-16 22:03:11 +00002268PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2269 const Py_UNICODE *u
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002270 ) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002271
2272PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002273 Py_UNICODE *s1,
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002274 const Py_UNICODE *s2) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002275
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002276PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002277 Py_UNICODE *s1, const Py_UNICODE *s2) Py_DEPRECATED(3.3);
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002278
Martin v. Löwis5b222132007-06-10 09:51:05 +00002279PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002280 Py_UNICODE *s1,
2281 const Py_UNICODE *s2,
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002282 size_t n) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002283
2284PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002285 const Py_UNICODE *s1,
2286 const Py_UNICODE *s2
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002287 ) Py_DEPRECATED(3.3);
Victor Stinneref8d95c2010-08-16 22:03:11 +00002288
2289PyAPI_FUNC(int) Py_UNICODE_strncmp(
2290 const Py_UNICODE *s1,
2291 const Py_UNICODE *s2,
2292 size_t n
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002293 ) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002294
2295PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002296 const Py_UNICODE *s,
2297 Py_UNICODE c
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002298 ) Py_DEPRECATED(3.3);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002299
Victor Stinner331ea922010-08-10 16:37:20 +00002300PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002301 const Py_UNICODE *s,
2302 Py_UNICODE c
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002303 ) Py_DEPRECATED(3.3);
Victor Stinner331ea922010-08-10 16:37:20 +00002304
Ethan Furmanb95b5612015-01-23 20:05:18 -08002305PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2306
Victor Stinner71133ff2010-09-01 23:43:53 +00002307/* Create a copy of a unicode string ending with a nul character. Return NULL
2308 and raise a MemoryError exception on memory allocation failure, otherwise
2309 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2310
Victor Stinner46408602010-09-03 16:18:00 +00002311PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002312 PyObject *unicode
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002313 ) Py_DEPRECATED(3.3);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002314#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002315
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002316#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002317PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002318 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002319 int check_content);
T. Woutersa00c3fd2017-03-31 09:14:41 -07002320#elif !defined(NDEBUG)
2321/* For asserts that call _PyUnicode_CheckConsistency(), which would
2322 * otherwise be a problem when building with asserts but without Py_DEBUG. */
2323#define _PyUnicode_CheckConsistency(op, check_content) PyUnicode_Check(op)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002324#endif
2325
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03002326#ifndef Py_LIMITED_API
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002327/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2328PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2329/* Clear all static strings. */
2330PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2331
Raymond Hettingerac2ef652015-07-04 16:04:44 -07002332/* Fast equality check when the inputs are known to be exact unicode types
2333 and where the hash values are equal (i.e. a very probable match) */
2334PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03002335#endif /* !Py_LIMITED_API */
Raymond Hettingerac2ef652015-07-04 16:04:44 -07002336
Guido van Rossumd8225182000-03-10 22:33:05 +00002337#ifdef __cplusplus
2338}
2339#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002340#endif /* !Py_UNICODEOBJECT_H */