blob: 6fdcd7bd69529c552332c92e5abf4b7a9b836bed [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test.  Code points below 128 are answered directly from the
   _Py_ascii_whitespace table; everything else is delegated to
   _PyUnicode_IsWhitespace().  The argument appears more than once in the
   expansion, so it should be free of side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Character class predicates: plain aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISALPHA(ch)     _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISLOWER(ch)     _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)     _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)     _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_ISDECIMAL(ch)   _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)     _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)   _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Case conversions: plain aliases for the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric value look-ups: plain aliases for the _PyUnicode_To*() helpers. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)   _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.  The four tests
   run in that order and stop at the first that succeeds; the argument is
   therefore evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)      \
    (Py_UNICODE_ISALPHA(ch)   ||    \
     Py_UNICODE_ISDECIMAL(ch) ||    \
     Py_UNICODE_ISDIGIT(ch)   ||    \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE code units from `source` to `target` by way of
   Py_MEMCPY.  The byte count is (length) * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length)                         \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` code units of `target`.

   Every argument is evaluated exactly once, in the order target, value,
   length.  The length is captured in a local before the loop starts, so an
   argument with side effects (or an expensive one) is not re-run on each
   iteration.  Nothing is written when length is zero or negative. */
#define Py_UNICODE_FILL(target, value, length)                          \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);                                       \
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;                        \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.

   The argument is fully parenthesized in each expansion so that compound
   expressions such as `x & 0xFFFF` are classified by their value.  It is
   evaluated up to twice, so it should be free of side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value: the low
   ten bits of `high` and of `low` are concatenated and offset by 0x10000.
   No check is made that the arguments really are surrogates. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Test whether `substring` occurs in `string` at `offset`.  The offset must
   be valid, and the substring must not be empty.

   The first and then the last code unit of the candidate window are compared
   before memcmp() is asked to compare the whole window, so most mismatches
   are rejected without a function call.  All three arguments are expanded
   several times. */

#define Py_UNICODE_MATCH(string, offset, substring)                          \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) &&               \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) ==          \
       *((substring)->wstr + (substring)->wstr_length-1))) &&                \
     !memcmp((string)->wstr + (offset),                                      \
             (substring)->wstr,                                              \
             (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200209 /* There a 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
220 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
241 * kind = PyUnicode_WCHAR_KIND
242 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200243 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * ready = 0
245 * wstr is not NULL
246 * data.any is NULL
247 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200248 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200250
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200251 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200252
253 * structure = PyUnicodeObject structure
254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255 PyUnicode_4BYTE_KIND
256 * compact = 0
257 * ready = 1
258 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200259 * utf8 is shared and utf8_length = length with data.any if ascii = 1
260 * utf8_length = 0 if utf8 is NULL
261 * wstr is shared and wstr_length = length with data.any
262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
263 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
264 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200265
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200266 Compact strings use only one memory block (structure + characters),
267 whereas legacy strings use one block for the structure and one block
268 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200269
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200270 Legacy strings are created by PyUnicode_FromUnicode() and
271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272 when PyUnicode_READY() is called.
273
274 See also _PyUnicode_CheckConsistency().
275 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000276 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000278 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 struct {
280 /*
281 SSTATE_NOT_INTERNED (0)
282 SSTATE_INTERNED_MORTAL (1)
283 SSTATE_INTERNED_IMMORTAL (2)
284
285 If interned != SSTATE_NOT_INTERNED, the two references from the
286 dictionary to this object are *not* counted in ob_refcnt.
287 */
288 unsigned int interned:2;
289 /* Character size:
290
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200291 - PyUnicode_WCHAR_KIND (0):
292
293 * character type = wchar_t (16 or 32 bits, depending on the
294 platform)
295
296 - PyUnicode_1BYTE_KIND (1):
297
298 * character type = Py_UCS1 (8 bits, unsigned)
299 * if ascii is 1, at least one character must be in range
300 U+80-U+FF, otherwise all characters must be in range U+00-U+7F
301
302 - PyUnicode_2BYTE_KIND (2):
303
304 * character type = Py_UCS2 (16 bits, unsigned)
305 * at least one character must be in range U+0100-U+1FFFF
306
307 - PyUnicode_4BYTE_KIND (3):
308
309 * character type = Py_UCS4 (32 bits, unsigned)
310 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200311 */
312 unsigned int kind:2;
313 /* Compact is with respect to the allocation scheme. Compact unicode
314 objects only require one memory block while non-compact objects use
315 one block for the PyUnicodeObject struct and another for its data
316 buffer. */
317 unsigned int compact:1;
Victor Stinnera3b334d2011-10-03 13:53:37 +0200318 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
319 characters. If ascii is 1 and compact is 1, use the PyASCIIObject
320 structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 unsigned int ascii:1;
322 /* The ready flag indicates whether the object layout is initialized
323 completely. This means that this is either a compact object, or
324 the data pointer is filled out. The bit is redundant, and helps
325 to minimize the test in PyUnicode_IS_READY(). */
326 unsigned int ready:1;
327 } state;
328 wchar_t *wstr; /* wchar_t representation (null-terminated) */
329} PyASCIIObject;
330
331/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200332 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 immediately follow the structure. */
334typedef struct {
335 PyASCIIObject _base;
336 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
337 * terminating \0. */
338 char *utf8; /* UTF-8 representation (null-terminated) */
339 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
340 * surrogates count as two code points. */
341} PyCompactUnicodeObject;
342
343/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
344 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200345 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346typedef struct {
347 PyCompactUnicodeObject _base;
348 union {
349 void *any;
350 Py_UCS1 *latin1;
351 Py_UCS2 *ucs2;
352 Py_UCS4 *ucs4;
353 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000354} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000355#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000356
Mark Hammond91a681d2002-08-12 07:21:58 +0000357PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000358PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000359
/* True if op's type has the Py_TPFLAGS_UNICODE_SUBCLASS flag set, as tested
   by PyType_FastSubclass(). */
#define PyUnicode_Check(op)                                             \
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)

/* True only if op's type is exactly PyUnicode_Type. */
#define PyUnicode_CheckExact(op)                                        \
    (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000363
364/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000365#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366
/* Length of the wstr representation.  Compact ASCII strings have no
   wstr_length field, so their `length` field is used instead; every other
   form reads PyCompactUnicodeObject.wstr_length.  No type check is done.
   The argument is parenthesized before casting so that compound pointer
   expressions are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)

/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request (through PyUnicode_AsUnicode(), whose result is discarded
   here).  Use PyUnicode_GET_LENGTH() for the length in code points.
   The argument is expanded several times. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the deprecated Py_UNICODE representation in bytes:
   PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)

/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand; an already present wstr pointer is returned
   directly.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* The PyUnicode_AS_UNICODE(op) buffer viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
399
400
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200401/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200402
403/* Values for PyUnicodeObject.state: */
404
405/* Interning state. */
406#define SSTATE_NOT_INTERNED 0
407#define SSTATE_INTERNED_MORTAL 1
408#define SSTATE_INTERNED_IMMORTAL 2
409
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  The argument is parenthesized before the
   cast so that compound pointer expressions are cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not. No type checks or Ready calls are performed.
   The argument is expanded twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200425
/* Kind of a string that only has a wchar_t (wstr) representation.  This is
   only possible when the string was created with a legacy API and
   _PyUnicode_Ready() has not been called yet. */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro: */

#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3


/* Return the number of bytes the string uses to represent single characters,
   this can be 1, 2 or 4.  The value is 2 ** (kind - 1), with the kind taken
   from PyUnicode_KIND(op).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op)     \
    (1 << (PyUnicode_KIND(op) - 1))
444
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op)    ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op)    ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op)    ((Py_UCS4*)PyUnicode_DATA(op))

/* Return one of the PyUnicode_*_KIND values defined above.  Asserts that op
   is a unicode object and that it is ready. */
#define PyUnicode_KIND(op)                  \
    (assert(PyUnicode_Check(op)),           \
     assert(PyUnicode_IS_READY(op)),        \
     ((PyASCIIObject *)(op))->state.kind)

/* Data pointer of a compact string: the address just past the
   PyASCIIObject header for compact ASCII strings, or just past the
   PyCompactUnicodeObject header otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Data pointer of a non-compact string: the data.any member, asserted to be
   non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Return a void pointer to the raw unicode buffer, selecting the compact or
   non-compact accessor according to PyUnicode_IS_COMPACT(op). */
#define PyUnicode_DATA(op)                                       \
    (assert(PyUnicode_Check(op)),                                \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :    \
     _PyUnicode_NONCOMPACT_DATA(op))
474
/* Convert a character index into a byte size for a given kind: the result
   is index * char_size, where char_size is 2 ** (kind - 1), computed with a
   left shift.

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) \
    ((index) << ((kind) - 1))
480
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Store the code point `value` at position `index` (starting at 0) of the
   canonical buffer `data`, narrowing it to the character type selected by
   `kind`.  No sanity checks are done beyond an assert that an unrecognised
   kind is PyUnicode_4BYTE_KIND; the macro is meant for loops, with the
   caller caching the kind and data pointer obtained from other macros. */
#define PyUnicode_WRITE(kind, data, index, value)                       \
    do {                                                                \
        switch ((kind)) {                                               \
        case PyUnicode_1BYTE_KIND:                                      \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value);            \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value);            \
            break;                                                      \
        default:                                                        \
            assert((kind) == PyUnicode_4BYTE_KIND);                     \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value);            \
            break;                                                      \
        }                                                               \
    } while (0)
507
/* Fetch the code point at `index` from the canonical buffer `data`,
   interpreting the buffer according to `kind`, and widen it to Py_UCS4.
   No checks or ready calls are performed.  `kind` is tested against
   PyUnicode_1BYTE_KIND first and PyUnicode_2BYTE_KIND second; any other
   value is read as 4-byte data. */
#define PyUnicode_READ(kind, data, index)                   \
    ((Py_UCS4)                                              \
     ((kind) == PyUnicode_1BYTE_KIND                        \
          ? ((const Py_UCS1 *)(data))[(index)]              \
          : ((kind) == PyUnicode_2BYTE_KIND                 \
                 ? ((const Py_UCS2 *)(data))[(index)]       \
                 : ((const Py_UCS4 *)(data))[(index)])))
519
/* Read the code point at `index` straight from a unicode object.

   PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  Use it for single reads;
   for multiple consecutive reads callers should cache kind (and the data
   pointer) and use PyUnicode_READ instead.  Asserts that `unicode` is a
   ready unicode object. */
#define PyUnicode_READ_CHAR(unicode, index)                                   \
    (assert(PyUnicode_Check(unicode)),                                        \
     assert(PyUnicode_IS_READY(unicode)),                                     \
     (Py_UCS4)                                                                \
     (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND                       \
          ? ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)]           \
          : (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND                \
                 ? ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)]    \
                 : ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)])))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535
/* Returns the length of the unicode string in code points. The caller has to
   make sure that the string has its canonical representation set before
   calling this macro.  Call PyUnicode_READY() to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)


/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   The argument is parenthesized before the cast so that compound pointer
   expressions are cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)

/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: an object whose ready flag is already set costs one test.  If the
   canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                  \
     (PyUnicode_IS_READY(op) ?                     \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558
/* Return a maximum character value which is suitable for creating another
   string based on op. This is always an approximation but more efficient
   than iterating over the string.

   Compact ASCII strings yield 0x7f.  Other 1-byte strings yield 0x7f when
   their data pointer is also their utf8 pointer and 0xff otherwise; 2-byte
   strings yield 0xffff and anything else 0x10ffff.  Asserts that op is
   ready. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                        \
    (assert(PyUnicode_IS_READY(op)),                                        \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f:                                \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                         \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ?    \
        (0x7fU) : (0xffU)                                                   \
        ) :                                                                 \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                        \
        (0xffffU) : (0x10ffffU)                                             \
        ))))
572
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000573#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000574
575/* --- Constants ---------------------------------------------------------- */
576
577/* This Unicode character will be used as replacement character during
578 decoding if the errors argument is set to "replace". Note: the
579 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
580 Unicode 3.0. */
581
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200582#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000583
584/* === Public API ========================================================= */
585
586/* --- Plain Py_UNICODE --------------------------------------------------- */
587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200588/* With PEP 393, this is the recommended way to allocate a new unicode object.
589 This function will allocate the object and its buffer in a single memory
590 block. Objects created using this function are not resizable. */
591#ifndef Py_LIMITED_API
592PyAPI_FUNC(PyObject*) PyUnicode_New(
593 Py_ssize_t size, /* Number of code points in the new string */
594 Py_UCS4 maxchar /* maximum code point value in the string */
595 );
596#endif
597
/* Initializes the canonical string representation from the deprecated
599 wstr/Py_UNICODE representation. This function is used to convert Unicode
600 objects which were created using the old API to the new flexible format
601 introduced with PEP 393.
602
603 Don't call this function directly, use the public PyUnicode_READY() macro
604 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605#ifndef Py_LIMITED_API
606PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200607 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608 );
609#endif
610
Victor Stinner034f6cf2011-09-30 02:26:44 +0200611/* Get a copy of a Unicode string. */
612PyAPI_FUNC(PyObject*) PyUnicode_Copy(
613 PyObject *unicode
614 );
615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616/* Copy characters from one Unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200617 character conversion when necessary and falls back to memcpy if possible.
618
Victor Stinnera0702ab2011-09-29 14:14:38 +0200619 Fail if to is too small (smaller than how_many or smaller than
620 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
621 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200622
623 Return the number of written character, or return -1 and raise an exception
624 on error.
625
626 Pseudo-code:
627
628 how_many = min(how_many, len(from) - from_start)
629 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
630 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200631
632 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200633 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200635PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200636 PyObject *to,
637 Py_ssize_t to_start,
638 PyObject *from,
639 Py_ssize_t from_start,
640 Py_ssize_t how_many
641 );
642#endif
643
Guido van Rossumd8225182000-03-10 22:33:05 +0000644/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000646
647 u may be NULL which causes the contents to be undefined. It is the
648 user's responsibility to fill in the needed data afterwards. Note
649 that modifying the Unicode object contents after construction is
650 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000651
652 The buffer is copied into the new object. */
653
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000654#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000655PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000656 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000657 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000658 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000659#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000660
Georg Brandl952867a2010-06-27 10:17:12 +0000661/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000663 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000664 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000665 );
666
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000667/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200668 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000669PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000670 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000671 );
672
Victor Stinnerb9275c12011-10-05 14:01:42 +0200673/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
674 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675#ifndef Py_LIMITED_API
676PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
677 int kind,
678 const void *buffer,
679 Py_ssize_t size);
680#endif
681
682PyAPI_FUNC(PyObject*) PyUnicode_Substring(
683 PyObject *str,
684 Py_ssize_t start,
685 Py_ssize_t end);
686
687/* Copy the string into a UCS4 buffer, including the null character if copy_null
688 is set. Return NULL and raise an exception on error. Raise a ValueError if
689 the buffer is smaller than the string. Return buffer on success.
690
691 buflen is the length of the buffer in (Py_UCS4) characters. */
692PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
693 PyObject *unicode,
694 Py_UCS4* buffer,
695 Py_ssize_t buflen,
696 int copy_null);
697
698/* Copy the string into a UCS4 buffer. A new buffer is allocated using
699 PyMem_Malloc; if this fails, NULL is returned with a memory error
700 exception set. */
701PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
702
Guido van Rossumd8225182000-03-10 22:33:05 +0000703/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 Py_UNICODE buffer.
705 If the wchar_t/Py_UNICODE representation is not yet available, this
706 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000707
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000708#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000709PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000711 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000712#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714/* Return a read-only pointer to the Unicode object's internal
715 Py_UNICODE buffer and save the length at size.
716 If the wchar_t/Py_UNICODE representation is not yet available, this
717 function will calculate it. */
718
719#ifndef Py_LIMITED_API
720PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
721 PyObject *unicode, /* Unicode object */
722 Py_ssize_t *size /* location where to save the length */
723 );
724#endif
725
Guido van Rossumd8225182000-03-10 22:33:05 +0000726/* Get the length of the Unicode object. */
727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200728PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
729 PyObject *unicode
730);
731
Victor Stinner157f83f2011-09-28 21:41:31 +0200732/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200733 string representation. */
734
Martin v. Löwis18e16552006-02-15 17:27:45 +0000735PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000737 );
738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200739/* Read a character from the string. */
740
741PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
742 PyObject *unicode,
743 Py_ssize_t index
744 );
745
746/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200747 PyUnicode_New, must not be shared, and must not have been hashed yet.
748
749 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750
751PyAPI_FUNC(int) PyUnicode_WriteChar(
752 PyObject *unicode,
753 Py_ssize_t index,
754 Py_UCS4 character
755 );
756
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000757#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000758/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000759PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000760#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000761
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200762/* Resize a Unicode object allocated by the legacy API (e.g.
763 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
764 PyUnicode_New) cannot be resized by this function.
765
766 The length is a number of Py_UNICODE characters (and not the number of code
767 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000768
769 *unicode is modified to point to the new (resized) object and 0
770 returned on success.
771
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200772 If the refcount on the object is 1, the function resizes the string in
773 place, which is usually faster than allocating a new string (and copying
774 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000775
776 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000778
Mark Hammond91a681d2002-08-12 07:21:58 +0000779PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 PyObject **unicode, /* Pointer to the Unicode object */
781 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000782 );
783
Guido van Rossumd8225182000-03-10 22:33:05 +0000784/* Coerce obj to a Unicode object and return a reference with
785 *incremented* refcount.
786
787 Coercion is done in the following way:
788
Georg Brandl952867a2010-06-27 10:17:12 +0000789 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000790 using the given encoding (UTF-8 if encoding is NULL) and error
791 handling (the codec's default, "strict", if errors is NULL).
Guido van Rossumd8225182000-03-10 22:33:05 +0000792
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000793 2. All other objects (including Unicode objects) raise an
794 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000795
796 The API returns NULL in case of an error. The caller is responsible
797 for decref'ing the returned objects.
798
799*/
800
Mark Hammond91a681d2002-08-12 07:21:58 +0000801PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000803 const char *encoding, /* encoding */
804 const char *errors /* error handling */
805 );
806
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000807/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000808 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000810 Unicode objects are passed back as-is (subclasses are converted to
811 true Unicode objects), all other objects are delegated to
812 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000813 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000814
815 The API returns NULL in case of an error. The caller is responsible
816 for decref'ing the returned objects.
817
818*/
819
Mark Hammond91a681d2002-08-12 07:21:58 +0000820PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000822 );
823
Victor Stinner1205f272010-09-11 00:54:47 +0000824PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
825 const char *format, /* ASCII-encoded string */
826 va_list vargs
827 );
828PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
829 const char *format, /* ASCII-encoded string */
830 ...
831 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000832
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000833#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000834/* Format the object based on the format_spec, as defined in PEP 3101
835 (Advanced String Formatting). */
836PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200837 PyObject *format_spec,
838 Py_ssize_t start,
839 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000840#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000841
Walter Dörwald16807132007-05-25 13:52:07 +0000842PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
843PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000844PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
845 const char *u /* UTF-8 encoded string */
846 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000847#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000848PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000849#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000850
851/* Read the interned state field of op. Use only if you know op is a
   Unicode string: op is cast to PyASCIIObject * without any type check. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852#define PyUnicode_CHECK_INTERNED(op) \
853 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000854
Guido van Rossumd8225182000-03-10 22:33:05 +0000855/* --- wchar_t support for platforms which support it --------------------- */
856
857#ifdef HAVE_WCHAR_H
858
Georg Brandl952867a2010-06-27 10:17:12 +0000859/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000860 size.
861
862 The buffer is copied into the new object. */
863
Mark Hammond91a681d2002-08-12 07:21:58 +0000864PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000865 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 );
868
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000869/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000870 most size wchar_t characters are copied.
871
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000872 Note that the resulting wchar_t string may or may not be
873 0-terminated. It is the responsibility of the caller to make sure
874 that the wchar_t string is 0-terminated in case this is required by
875 the application.
876
877 Returns the number of wchar_t characters copied (excluding a
878 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000879 error. */
880
Martin v. Löwis18e16552006-02-15 17:27:45 +0000881PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000882 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000883 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000884 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000885 );
886
Victor Stinner137c34c2010-09-29 10:25:54 +0000887/* Convert the Unicode object to a wide character string. The output string
888 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200889 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000890
891 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
892 on success. On error, returns NULL and raises a MemoryError; *size is
893 undefined. */
894
895PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000896 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000897 Py_ssize_t *size /* number of characters of the result */
898 );
899
Victor Stinner9f789e72011-10-01 03:57:28 +0200900#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200902#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903
Guido van Rossumd8225182000-03-10 22:33:05 +0000904#endif
905
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000906/* --- Unicode ordinals --------------------------------------------------- */
907
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908/* Create a Unicode Object from the given Unicode code point ordinal.
909
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000910 The ordinal must be in range(0x10000) on narrow Python builds
911 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
912 raised in case it is not.
913
914*/
915
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000916PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000918/* --- Free-list management ----------------------------------------------- */
919
920/* Clear the free list used by the Unicode implementation.
921
922 This can be used to release memory used for objects on the free
923 list back to the Python memory allocator.
924
925*/
926
927PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
928
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000929/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000930
931 Many of these APIs take two arguments encoding and errors. These
932 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000933 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000934
Georg Brandl952867a2010-06-27 10:17:12 +0000935 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000936
937 Error handling is set by errors which may also be set to NULL
938 meaning to use the default handling defined for the codec. Default
939 error handling for all builtin codecs is "strict" (ValueErrors are
940 raised).
941
942 The codecs all use a similar interface. Only deviations from the
943 generic ones are documented.
944
945*/
946
Fred Drakecb093fe2000-05-09 19:51:53 +0000947/* --- Manage the default encoding ---------------------------------------- */
948
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000949/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000950 Unicode object unicode and the size of the encoded representation
951 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000952
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000953 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000954
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200955 This function caches the UTF-8 encoded string in the unicodeobject
956 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 when the unicodeobject is deallocated.
958
959 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
960 support the previous internal function with the same behaviour.
961
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000962 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000963 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000964
965 *** If you need to access the Unicode object as UTF-8 bytes string,
966 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000967*/
968
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000969#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000971 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000972 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000974#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000975
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000976/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000977 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
980 in the unicodeobject.
981
982 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
983 support the previous internal function with the same behaviour.
984
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000985 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000986 extracted from the returned data.
987
988 *** This API is for interpreter INTERNAL USE ONLY and will likely
989 *** be removed or changed for Python 3.1.
990
991 *** If you need to access the Unicode object as UTF-8 bytes string,
992 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000993
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000994*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000995
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000996#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
998#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000999#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001000
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001001/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001002
Mark Hammond91a681d2002-08-12 07:21:58 +00001003PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001004
Guido van Rossumd8225182000-03-10 22:33:05 +00001005/* --- Generic Codecs ----------------------------------------------------- */
1006
1007/* Create a Unicode object by decoding the encoded string s of the
1008 given size. */
1009
Mark Hammond91a681d2002-08-12 07:21:58 +00001010PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001011 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001012 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001013 const char *encoding, /* encoding */
1014 const char *errors /* error handling */
1015 );
1016
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001017/* Decode a Unicode object unicode and return the result as Python
1018 object. */
1019
1020PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 PyObject *unicode, /* Unicode object */
1022 const char *encoding, /* encoding */
1023 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001024 );
1025
1026/* Decode a Unicode object unicode and return the result as Unicode
1027 object. */
1028
1029PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 PyObject *unicode, /* Unicode object */
1031 const char *encoding, /* encoding */
1032 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001033 );
1034
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001035/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001036 Python string object. */
1037
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001038#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001039PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001040 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001042 const char *encoding, /* encoding */
1043 const char *errors /* error handling */
1044 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001045#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001046
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001047/* Encodes a Unicode object and returns the result as Python
1048 object. */
1049
1050PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 PyObject *unicode, /* Unicode object */
1052 const char *encoding, /* encoding */
1053 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001054 );
1055
Guido van Rossumd8225182000-03-10 22:33:05 +00001056/* Encodes a Unicode object and returns the result as Python string
1057 object. */
1058
Mark Hammond91a681d2002-08-12 07:21:58 +00001059PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001060 PyObject *unicode, /* Unicode object */
1061 const char *encoding, /* encoding */
1062 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001063 );
1064
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001065/* Encodes a Unicode object and returns the result as Unicode
1066 object. */
1067
1068PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 PyObject *unicode, /* Unicode object */
1070 const char *encoding, /* encoding */
1071 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001072 );
1073
1074/* Build an encoding map. */
1075
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001076PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1077 PyObject* string /* 256 character map */
1078 );
1079
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080/* --- UTF-7 Codecs ------------------------------------------------------- */
1081
Mark Hammond91a681d2002-08-12 07:21:58 +00001082PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001083 const char *string, /* UTF-7 encoded string */
1084 Py_ssize_t length, /* size of string */
1085 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 );
1087
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001088PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 const char *string, /* UTF-7 encoded string */
1090 Py_ssize_t length, /* size of string */
1091 const char *errors, /* error handling */
1092 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001093 );
1094
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001095#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001096PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 const Py_UNICODE *data, /* Unicode char buffer */
1098 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1099 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1100 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1101 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001102 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001103#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001104
Guido van Rossumd8225182000-03-10 22:33:05 +00001105/* --- UTF-8 Codecs ------------------------------------------------------- */
1106
Mark Hammond91a681d2002-08-12 07:21:58 +00001107PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 const char *string, /* UTF-8 encoded string */
1109 Py_ssize_t length, /* size of string */
1110 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001111 );
1112
Walter Dörwald69652032004-09-07 20:24:22 +00001113PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001114 const char *string, /* UTF-8 encoded string */
1115 Py_ssize_t length, /* size of string */
1116 const char *errors, /* error handling */
1117 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001118 );
1119
Mark Hammond91a681d2002-08-12 07:21:58 +00001120PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001122 );
1123
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001124#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1126 PyObject *unicode,
1127 const char *errors);
1128
Mark Hammond91a681d2002-08-12 07:21:58 +00001129PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001130 const Py_UNICODE *data, /* Unicode char buffer */
1131 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1132 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001133 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001134#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001135
Walter Dörwald41980ca2007-08-16 21:55:45 +00001136/* --- UTF-32 Codecs ------------------------------------------------------ */
1137
1138/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1139 the corresponding Unicode object.
1140
1141 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001142 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001143
1144 If byteorder is non-NULL, the decoder starts decoding using the
1145 given byte order:
1146
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 *byteorder == -1: little endian
1148 *byteorder == 0: native order
1149 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001150
1151 In native mode, the first four bytes of the stream are checked for a
1152 BOM mark. If found, the BOM mark is analysed, the byte order
1153 adjusted and the BOM skipped. In the other modes, no BOM mark
1154 interpretation is done. After completion, *byteorder is set to the
1155 current byte order at the end of input data.
1156
1157 If byteorder is NULL, the codec starts in native order mode.
1158
1159*/
1160
1161PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 const char *string, /* UTF-32 encoded string */
1163 Py_ssize_t length, /* size of string */
1164 const char *errors, /* error handling */
1165 int *byteorder /* pointer to byteorder to use
1166 0=native;-1=LE,1=BE; updated on
1167 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001168 );
1169
1170PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171 const char *string, /* UTF-32 encoded string */
1172 Py_ssize_t length, /* size of string */
1173 const char *errors, /* error handling */
1174 int *byteorder, /* pointer to byteorder to use
1175 0=native;-1=LE,1=BE; updated on
1176 exit */
1177 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001178 );
1179
1180/* Returns a Python string using the UTF-32 encoding in native byte
1181 order. The string always starts with a BOM mark. */
1182
1183PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001184 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001185 );
1186
1187/* Returns a Python string object holding the UTF-32 encoded value of
1188 the Unicode data.
1189
1190 If byteorder is not 0, output is written according to the following
1191 byte order:
1192
1193 byteorder == -1: little endian
1194 byteorder == 0: native byte order (writes a BOM mark)
1195 byteorder == 1: big endian
1196
1197 If byteorder is 0, the output string will always start with the
1198 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1199 prepended.
1200
1201*/
1202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001203#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001204PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 const Py_UNICODE *data, /* Unicode char buffer */
1206 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1207 const char *errors, /* error handling */
1208 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001209 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001210#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001211
Guido van Rossumd8225182000-03-10 22:33:05 +00001212/* --- UTF-16 Codecs ------------------------------------------------------ */
1213
Guido van Rossum9e896b32000-04-05 20:11:21 +00001214/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001215 the corresponding Unicode object.
1216
1217 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001218 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001219
1220 If byteorder is non-NULL, the decoder starts decoding using the
1221 given byte order:
1222
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 *byteorder == -1: little endian
1224 *byteorder == 0: native order
1225 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001226
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001227 In native mode, the first two bytes of the stream are checked for a
1228 BOM mark. If found, the BOM mark is analysed, the byte order
1229 adjusted and the BOM skipped. In the other modes, no BOM mark
1230 interpretation is done. After completion, *byteorder is set to the
1231 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001232
1233 If byteorder is NULL, the codec starts in native order mode.
1234
1235*/
1236
Mark Hammond91a681d2002-08-12 07:21:58 +00001237PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 const char *string, /* UTF-16 encoded string */
1239 Py_ssize_t length, /* size of string */
1240 const char *errors, /* error handling */
1241 int *byteorder /* pointer to byteorder to use
1242 0=native;-1=LE,1=BE; updated on
1243 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001244 );
1245
Walter Dörwald69652032004-09-07 20:24:22 +00001246PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001247 const char *string, /* UTF-16 encoded string */
1248 Py_ssize_t length, /* size of string */
1249 const char *errors, /* error handling */
1250 int *byteorder, /* pointer to byteorder to use
1251 0=native;-1=LE,1=BE; updated on
1252 exit */
1253 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001254 );
1255
Guido van Rossumd8225182000-03-10 22:33:05 +00001256/* Returns a Python string using the UTF-16 encoding in native byte
1257 order. The string always starts with a BOM mark. */
1258
Mark Hammond91a681d2002-08-12 07:21:58 +00001259PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001261 );
1262
1263/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001264 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001265
1266 The output is written according to the value of the byteorder
1267 argument:
1268
1269 byteorder == -1: little endian
1270 byteorder == 0: native byte order (writes a BOM mark)
1271 byteorder == 1: big endian
1272
1273 If byteorder is 0, the output string will always start with the
1274 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1275 prepended.
1276
1277 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1278 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001279 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001280
1281*/
1282
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001283#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001284PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 const Py_UNICODE *data, /* Unicode char buffer */
1286 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1287 const char *errors, /* error handling */
1288 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001289 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001290#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001291
1292/* --- Unicode-Escape Codecs ---------------------------------------------- */
1293
Mark Hammond91a681d2002-08-12 07:21:58 +00001294PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001295 const char *string, /* Unicode-Escape encoded string */
1296 Py_ssize_t length, /* size of string */
1297 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001298 );
1299
Mark Hammond91a681d2002-08-12 07:21:58 +00001300PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001301 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 );
1303
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001304#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001305PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 const Py_UNICODE *data, /* Unicode char buffer */
1307 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001308 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001309#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001310
1311/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1312
Mark Hammond91a681d2002-08-12 07:21:58 +00001313PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 const char *string, /* Raw-Unicode-Escape encoded string */
1315 Py_ssize_t length, /* size of string */
1316 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001317 );
1318
Mark Hammond91a681d2002-08-12 07:21:58 +00001319PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001321 );
1322
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001323#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001324PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 const Py_UNICODE *data, /* Unicode char buffer */
1326 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001327 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001328#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001329
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001330/* --- Unicode Internal Codec ---------------------------------------------
1331
1332 Only for internal use in _codecsmodule.c */
1333
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001334#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001335PyObject *_PyUnicode_DecodeUnicodeInternal(
1336 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001337 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001338 const char *errors
1339 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001340#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001341
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001343
1344 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1345
1346*/
1347
Mark Hammond91a681d2002-08-12 07:21:58 +00001348PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 const char *string, /* Latin-1 encoded string */
1350 Py_ssize_t length, /* size of string */
1351 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001352 );
1353
Mark Hammond91a681d2002-08-12 07:21:58 +00001354PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001355 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001356 );
1357
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001358#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1360 PyObject* unicode, /* Unicode object */
1361 const char* errors); /* error handling */
1362
Mark Hammond91a681d2002-08-12 07:21:58 +00001363PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001364 const Py_UNICODE *data, /* Unicode char buffer */
1365 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1366 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001367 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001368#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001369
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001371
1372 Only 7-bit ASCII data is accepted. All other codes generate errors.
1373
1374*/
1375
Mark Hammond91a681d2002-08-12 07:21:58 +00001376PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 const char *string, /* ASCII encoded string */
1378 Py_ssize_t length, /* size of string */
1379 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001380 );
1381
Mark Hammond91a681d2002-08-12 07:21:58 +00001382PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001383 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001384 );
1385
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001386#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1388 PyObject* unicode, /* Unicode object */
1389 const char* errors); /* error handling */
1390
Mark Hammond91a681d2002-08-12 07:21:58 +00001391PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001392 const Py_UNICODE *data, /* Unicode char buffer */
1393 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1394 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001395 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001396#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001397
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001398/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001399
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001401
1402 Decoding mappings must map single string characters to single
1403 Unicode characters, integers (which are then interpreted as Unicode
1404 ordinals) or None (meaning "undefined mapping" and causing an
1405 error).
1406
1407 Encoding mappings must map single Unicode characters to single
1408 string characters, integers (which are then interpreted as Latin-1
1409 ordinals) or None (meaning "undefined mapping" and causing an
1410 error).
1411
1412 If a character lookup fails with a LookupError, the character is
1413 copied as-is meaning that its ordinal value will be interpreted as
1414 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1415 to contain those mappings which map characters to different code
1416 points.
1417
1418*/
1419
Mark Hammond91a681d2002-08-12 07:21:58 +00001420PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001421 const char *string, /* Encoded string */
1422 Py_ssize_t length, /* size of string */
1423 PyObject *mapping, /* character mapping
1424 (char ordinal -> unicode ordinal) */
1425 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001426 );
1427
Mark Hammond91a681d2002-08-12 07:21:58 +00001428PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001429 PyObject *unicode, /* Unicode object */
1430 PyObject *mapping /* character mapping
1431 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001432 );
1433
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001434#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001435PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001436 const Py_UNICODE *data, /* Unicode char buffer */
1437 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1438 PyObject *mapping, /* character mapping
1439 (unicode ordinal -> char ordinal) */
1440 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001441 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001442#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001443
1444/* Translate a Py_UNICODE buffer of the given length by applying a
1445 character mapping table to it and return the resulting Unicode
1446 object.
1447
1448 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001449 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001450
1451 Mapping tables may be dictionaries or sequences. Unmapped character
1452 ordinals (ones which cause a LookupError) are left untouched and
1453 are copied as-is.
1454
1455*/
1456
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001457#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001458PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001459 const Py_UNICODE *data, /* Unicode char buffer */
1460 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1461 PyObject *table, /* Translation table (Unicode ordinal ->
 Unicode ordinal or None) */
1462 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001463 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001464#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001465
Victor Stinner99b95382011-07-04 14:23:54 +02001466#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001467
Guido van Rossumefec1152000-03-28 02:01:15 +00001468/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001469
Mark Hammond91a681d2002-08-12 07:21:58 +00001470PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001471 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001473 const char *errors /* error handling */
1474 );
1475
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001476PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1477 const char *string, /* MBCS encoded string */
1478 Py_ssize_t length, /* size of string */
1479 const char *errors, /* error handling */
1480 Py_ssize_t *consumed /* bytes consumed */
1481 );
1482
Mark Hammond91a681d2002-08-12 07:21:58 +00001483PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001484 PyObject *unicode /* Unicode object */
1485 );
1486
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001487#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001488PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001489 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001490 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001491 const char *errors /* error handling */
1492 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001493#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001494
Victor Stinner99b95382011-07-04 14:23:54 +02001495#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001496
Guido van Rossum9e896b32000-04-05 20:11:21 +00001497/* --- Decimal Encoder ---------------------------------------------------- */
1498
1499/* Takes a Unicode string holding a decimal value and writes it into
1500 an output buffer using standard ASCII digit codes.
1501
1502 The output buffer has to provide at least length+1 bytes of storage
1503 area. The output string is 0-terminated.
1504
1505 The encoder converts whitespace to ' ', decimal characters to their
1506 corresponding ASCII digit and all other Latin-1 characters except
1507 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1508 are treated as errors. This includes embedded NUL characters.
1509
1510 Error handling is defined by the errors argument:
1511
1512 NULL or "strict": raise a ValueError
1513 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001514 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001515 "replace": replaces illegal characters with '?'
1516
1517 Returns 0 on success, -1 on failure.
1518
1519*/
1520
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001521#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001522PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001523 Py_UNICODE *s, /* Unicode buffer */
1524 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1525 char *output, /* Output buffer; must provide at least
 length+1 bytes (result is 0-terminated) */
1526 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001527 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001528#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001529
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001530/* Transforms code points that have decimal digit property to the
1531 corresponding ASCII digit code points.
1532
1533 Returns a new Unicode string on success, NULL on failure.
1534*/
1535
Georg Brandlb5503082010-12-05 11:40:48 +00001536#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001537PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1538 Py_UNICODE *s, /* Unicode buffer */
1539 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1540 );
Georg Brandlb5503082010-12-05 11:40:48 +00001541#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1544 as argument instead of a raw buffer and length. This function additionally
1545 transforms spaces to ASCII because this is what the callers in longobject,
1546 floatobject, and complexobject did anyways. */
1547
1548#ifndef Py_LIMITED_API
1549PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1550 PyObject *unicode /* Unicode object */
1551 );
1552#endif
1553
Martin v. Löwis011e8422009-05-05 04:43:17 +00001554/* --- File system encoding ---------------------------------------------- */
1555
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001556/* ParseTuple converter: encode str objects to bytes using
1557 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001558
1559PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1560
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001561/* ParseTuple converter: decode bytes objects to unicode using
1562 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1563
1564PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1565
Victor Stinner77c38622010-05-14 15:58:55 +00001566/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1567 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001568
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001569 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1570 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001571
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001572 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001573*/
1574
1575PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1576 const char *s /* encoded string */
1577 );
1578
Victor Stinner77c38622010-05-14 15:58:55 +00001579/* Decode a string using Py_FileSystemDefaultEncoding
1580 and the "surrogateescape" error handler.
1581
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001582 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1583 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001584*/
1585
Martin v. Löwis011e8422009-05-05 04:43:17 +00001586PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1587 const char *s, /* encoded string */
1588 Py_ssize_t size /* size */
1589 );
1590
Victor Stinnerae6265f2010-05-15 16:27:27 +00001591/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001592 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001593
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001594 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1595 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001596*/
1597
1598PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1599 PyObject *unicode
1600 );
1601
Guido van Rossumd8225182000-03-10 22:33:05 +00001602/* --- Methods & Slots ----------------------------------------------------
1603
1604 These are capable of handling Unicode objects and strings on input
1605 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001606 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001607
1608/* Concat two strings giving a new Unicode string. */
1609
Mark Hammond91a681d2002-08-12 07:21:58 +00001610PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001611 PyObject *left, /* Left string */
1612 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001613 );
1614
Walter Dörwald1ab83302007-05-18 17:15:44 +00001615/* Concat two strings and put the result in *pleft
1616 (sets *pleft to NULL on error) */
1617
1618PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001619 PyObject **pleft, /* Pointer to left string */
1620 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001621 );
1622
1623/* Concat two strings, put the result in *pleft and drop the right object
1624 (sets *pleft to NULL on error) */
1625
1626PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 PyObject **pleft, /* Pointer to left string */
1628 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001629 );
1630
Guido van Rossumd8225182000-03-10 22:33:05 +00001631/* Split a string giving a list of Unicode strings.
1632
1633 If sep is NULL, splitting will be done at all whitespace
1634 substrings. Otherwise, splits occur at the given separator.
1635
1636 At most maxsplit splits will be done. If negative, no limit is set.
1637
1638 Separators are not included in the resulting list.
1639
1640*/
1641
Mark Hammond91a681d2002-08-12 07:21:58 +00001642PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 PyObject *s, /* String to split */
1644 PyObject *sep, /* String separator */
1645 Py_ssize_t maxsplit /* Maxsplit count */
1646 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001647
1648/* Ditto, but split at line breaks.
1649
1650 CRLF is considered to be one line break. Line breaks are not
1651 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001652
Mark Hammond91a681d2002-08-12 07:21:58 +00001653PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654 PyObject *s, /* String to split */
1655 int keepends /* If true, line end markers are included */
1656 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001657
Thomas Wouters477c8d52006-05-27 19:21:47 +00001658/* Partition a string using a given separator. */
1659
1660PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001661 PyObject *s, /* String to partition */
1662 PyObject *sep /* String separator */
1663 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001664
1665/* Partition a string using a given separator, searching from the end of the
1666 string. */
1667
1668PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669 PyObject *s, /* String to partition */
1670 PyObject *sep /* String separator */
1671 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001672
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001673/* Split a string giving a list of Unicode strings.
1674
1675 If sep is NULL, splitting will be done at all whitespace
1676 substrings. Otherwise, splits occur at the given separator.
1677
1678 At most maxsplit splits will be done; if maxsplit is negative, no
1679 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1680 the end of the string.
1681
1682 Separators are not included in the resulting list.
1683
1684*/
1685
1686PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 PyObject *s, /* String to split */
1688 PyObject *sep, /* String separator */
1689 Py_ssize_t maxsplit /* Maxsplit count */
1690 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001691
Guido van Rossumd8225182000-03-10 22:33:05 +00001692/* Translate a string by applying a character mapping table to it and
1693 return the resulting Unicode object.
1694
1695 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001697
1698 Mapping tables may be dictionaries or sequences. Unmapped character
1699 ordinals (ones which cause a LookupError) are left untouched and
1700 are copied as-is.
1701
1702*/
1703
Mark Hammond91a681d2002-08-12 07:21:58 +00001704PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 PyObject *str, /* String */
1706 PyObject *table, /* Translate table */
1707 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001708 );
1709
1710/* Join a sequence of strings using the given separator and return
1711 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001712
Mark Hammond91a681d2002-08-12 07:21:58 +00001713PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714 PyObject *separator, /* Separator string */
1715 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001716 );
1717
1718/* Return 1 if substr matches str[start:end] at the given tail end, 0
1719 otherwise. */
1720
Martin v. Löwis18e16552006-02-15 17:27:45 +00001721PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 PyObject *str, /* String */
1723 PyObject *substr, /* Prefix or Suffix string */
1724 Py_ssize_t start, /* Start index */
1725 Py_ssize_t end, /* Stop index */
1726 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001727 );
1728
1729/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001730 given search direction or -1 if not found. -2 is returned in case
1731 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001732
Martin v. Löwis18e16552006-02-15 17:27:45 +00001733PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001734 PyObject *str, /* String */
1735 PyObject *substr, /* Substring to find */
1736 Py_ssize_t start, /* Start index */
1737 Py_ssize_t end, /* Stop index */
1738 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001739 );
1740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741/* Like PyUnicode_Find, but search for single character only. */
1742PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1743 PyObject *str, /* String */
1744 Py_UCS4 ch, /* Character to find */
1745 Py_ssize_t start, /* Start index */
1746 Py_ssize_t end, /* Stop index */
1747 int direction /* Find direction: +1 forward, -1 backward */
1748 );
1749
Barry Warsaw51ac5802000-03-20 16:36:48 +00001750/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001751
Martin v. Löwis18e16552006-02-15 17:27:45 +00001752PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001753 PyObject *str, /* String */
1754 PyObject *substr, /* Substring to count */
1755 Py_ssize_t start, /* Start index */
1756 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001757 );
1758
Barry Warsaw51ac5802000-03-20 16:36:48 +00001759/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001760 and return the resulting Unicode object. */
1761
Mark Hammond91a681d2002-08-12 07:21:58 +00001762PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001763 PyObject *str, /* String */
1764 PyObject *substr, /* Substring to find */
1765 PyObject *replstr, /* Substring to replace */
1766 Py_ssize_t maxcount /* Max. number of replacements to apply;
1767 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001768 );
1769
1770/* Compare two strings and return -1, 0, 1 for less than, equal,
1771 greater than resp. */
1772
Mark Hammond91a681d2002-08-12 07:21:58 +00001773PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001774 PyObject *left, /* Left string */
1775 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001776 );
1777
Martin v. Löwis5b222132007-06-10 09:51:05 +00001778PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1779 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001780 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001781 );
1782
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001783/* Rich compare two strings and return one of the following:
1784
1785 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001786 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001787 - Py_NotImplemented in case the type combination is unknown
1788
1789 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1790 case the conversion of the arguments to Unicode fails with a
1791 UnicodeDecodeError.
1792
1793 Possible values for op:
1794
1795 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1796
1797*/
1798
1799PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001800 PyObject *left, /* Left string */
1801 PyObject *right, /* Right string */
1802 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001803 );
1804
Thomas Wouters7e474022000-07-16 12:04:32 +00001805/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001806 the resulting Unicode string. */
1807
Mark Hammond91a681d2002-08-12 07:21:58 +00001808PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001809 PyObject *format, /* Format string */
1810 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001811 );
1812
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001813/* Checks whether element is contained in container and return 1/0
1814 accordingly.
1815
1816 element has to coerce to a one-element Unicode string. -1 is
1817 returned in case of an error. */
1818
Mark Hammond91a681d2002-08-12 07:21:58 +00001819PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001820 PyObject *container, /* Container string */
1821 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001822 );
1823
Martin v. Löwis47383402007-08-15 07:32:56 +00001824/* Checks whether argument is a valid identifier. */
1825
1826PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1827
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001828#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001829/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001830PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001831 PyUnicodeObject *self,
1832 int striptype,
1833 PyObject *sepobj
1834 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001835#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001836
Eric Smith5807c412008-05-11 21:00:57 +00001837/* Using the current locale, insert the thousands grouping
1838 into the string pointed to by buffer. For the argument descriptions,
1839 see Objects/stringlib/localeutil.h */
1840
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001841#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001842PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1843 Py_ssize_t n_buffer,
1844 Py_UNICODE *digits,
1845 Py_ssize_t n_digits,
1846 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001847#endif
Eric Smith5807c412008-05-11 21:00:57 +00001848
Eric Smitha3b1ac82009-04-03 14:45:06 +00001849/* Using explicit passed-in values, insert the thousands grouping
1850 into the string pointed to by buffer. For the argument descriptions,
1851 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001852#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001853PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001854 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 int kind,
1856 void *buffer,
1857 Py_ssize_t n_buffer,
1858 void *digits,
1859 Py_ssize_t n_digits,
1860 Py_ssize_t min_width,
1861 const char *grouping,
1862 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001863#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001864/* === Characters Type APIs =============================================== */
1865
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001866/* Helper array used by Py_UNICODE_ISSPACE(). */
1867
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001868#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001869PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1870
Guido van Rossumd8225182000-03-10 22:33:05 +00001871/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001872 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001873
1874 These APIs are implemented in Objects/unicodectype.c.
1875
1876*/
1877
Mark Hammond91a681d2002-08-12 07:21:58 +00001878PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001879 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001880 );
1881
Mark Hammond91a681d2002-08-12 07:21:58 +00001882PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001883 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001884 );
1885
Mark Hammond91a681d2002-08-12 07:21:58 +00001886PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001887 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001888 );
1889
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001890PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001891 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001892 );
1893
1894PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001895 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001896 );
1897
Mark Hammond91a681d2002-08-12 07:21:58 +00001898PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001899 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001900 );
1901
Mark Hammond91a681d2002-08-12 07:21:58 +00001902PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001903 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001904 );
1905
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001906PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1907 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001908 );
1909
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001910PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1911 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001912 );
1913
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001914PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1915 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001916 );
1917
Mark Hammond91a681d2002-08-12 07:21:58 +00001918PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001919 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001920 );
1921
Mark Hammond91a681d2002-08-12 07:21:58 +00001922PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001923 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001924 );
1925
Mark Hammond91a681d2002-08-12 07:21:58 +00001926PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001927 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001928 );
1929
Mark Hammond91a681d2002-08-12 07:21:58 +00001930PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001931 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001932 );
1933
Mark Hammond91a681d2002-08-12 07:21:58 +00001934PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001935 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
Mark Hammond91a681d2002-08-12 07:21:58 +00001938PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001939 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Georg Brandl559e5d72008-06-11 18:37:52 +00001942PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001943 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001944 );
1945
Mark Hammond91a681d2002-08-12 07:21:58 +00001946PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001947 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001948 );
1949
Victor Stinneref8d95c2010-08-16 22:03:11 +00001950PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1951 const Py_UNICODE *u
1952 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001953
1954PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001955 Py_UNICODE *s1,
1956 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001957
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001958PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1959 Py_UNICODE *s1, const Py_UNICODE *s2);
1960
Martin v. Löwis5b222132007-06-10 09:51:05 +00001961PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001962 Py_UNICODE *s1,
1963 const Py_UNICODE *s2,
1964 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001965
1966PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001967 const Py_UNICODE *s1,
1968 const Py_UNICODE *s2
1969 );
1970
1971PyAPI_FUNC(int) Py_UNICODE_strncmp(
1972 const Py_UNICODE *s1,
1973 const Py_UNICODE *s2,
1974 size_t n
1975 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001976
1977PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001978 const Py_UNICODE *s,
1979 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001980 );
1981
Victor Stinner331ea922010-08-10 16:37:20 +00001982PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001983 const Py_UNICODE *s,
1984 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001985 );
1986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987PyAPI_FUNC(size_t) Py_UCS4_strlen(
1988 const Py_UCS4 *u
1989 );
1990
1991PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1992 Py_UCS4 *s1,
1993 const Py_UCS4 *s2);
1994
1995PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1996 Py_UCS4 *s1, const Py_UCS4 *s2);
1997
1998PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1999 Py_UCS4 *s1,
2000 const Py_UCS4 *s2,
2001 size_t n);
2002
2003PyAPI_FUNC(int) Py_UCS4_strcmp(
2004 const Py_UCS4 *s1,
2005 const Py_UCS4 *s2
2006 );
2007
2008PyAPI_FUNC(int) Py_UCS4_strncmp(
2009 const Py_UCS4 *s1,
2010 const Py_UCS4 *s2,
2011 size_t n
2012 );
2013
2014PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
2015 const Py_UCS4 *s,
2016 Py_UCS4 c
2017 );
2018
2019PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
2020 const Py_UCS4 *s,
2021 Py_UCS4 c
2022 );
2023
Victor Stinner71133ff2010-09-01 23:43:53 +00002024/* Create a copy of a unicode string ending with a nul character. Return NULL
2025 and raise a MemoryError exception on memory allocation failure, otherwise
2026 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2027
Victor Stinner46408602010-09-03 16:18:00 +00002028PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002029 PyObject *unicode
2030 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002031#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002032
Guido van Rossumd8225182000-03-10 22:33:05 +00002033#ifdef __cplusplus
2034}
2035#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002036#endif /* !Py_UNICODEOBJECT_H */