blob: 3c691c1c1c104d544749d1596cd92016a2e0fdee [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store 'value' into the first 'length' Py_UNICODE slots of 'target'.
   Every argument is evaluated exactly once, in the order target, value,
   length: the element count is cached before the loop starts, so passing
   an expression with side effects as 'length' is safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t i_, n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with UTF-16 surrogates.
   The argument is parenthesized everywhere in the expansion, so compound
   expressions such as "x & 0xFFFF" or "c ? a : b" are classified as a
   whole.  Note that 'ch' is still evaluated twice by the range checks;
   do not pass expressions with side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of 'high' and of 'low' are concatenated and offset by
   0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200209 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
220 * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
241 * kind = PyUnicode_WCHAR_KIND
242 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200243 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * ready = 0
245 * wstr is not NULL
246 * data.any is NULL
247 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200248 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200250
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200251 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200252
253 * structure = PyUnicodeObject structure
254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255 PyUnicode_4BYTE_KIND
256 * compact = 0
257 * ready = 1
258 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200259 * utf8 is shared and utf8_length = length with data.any if ascii = 1
260 * utf8_length = 0 if utf8 is NULL
261 * wstr is shared and wstr_length = length with data.any
262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
263 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
264 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200265
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200266 Compact strings use only one memory block (structure + characters),
267 whereas legacy strings use one block for the structure and one block
268 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200269
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200270 Legacy strings are created by PyUnicode_FromUnicode() and
271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272 when PyUnicode_READY() is called.
273
274 See also _PyUnicode_CheckConsistency().
275 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000276 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000278 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 struct {
280 /*
281 SSTATE_NOT_INTERNED (0)
282 SSTATE_INTERNED_MORTAL (1)
283 SSTATE_INTERNED_IMMORTAL (2)
284
285 If interned != SSTATE_NOT_INTERNED, the two references from the
286 dictionary to this object are *not* counted in ob_refcnt.
287 */
288 unsigned int interned:2;
289 /* Character size:
290
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200291 - PyUnicode_WCHAR_KIND (0):
292
293 * character type = wchar_t (16 or 32 bits, depending on the
294 platform)
295
296 - PyUnicode_1BYTE_KIND (1):
297
298 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200299 * if ascii is set, all characters must be in range
300 U+0000-U+007F, otherwise at least one character must be in range
301 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302
303 - PyUnicode_2BYTE_KIND (2):
304
305 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200306 * at least one character must be in range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200307
308 - PyUnicode_4BYTE_KIND (3):
309
310 * character type = Py_UCS4 (32 bits, unsigned)
311 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 */
313 unsigned int kind:2;
314 /* Compact is with respect to the allocation scheme. Compact unicode
315 objects only require one memory block while non-compact objects use
316 one block for the PyUnicodeObject struct and another for its data
317 buffer. */
318 unsigned int compact:1;
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200319 /* The string only contains characters in range U+0000-U+007F (ASCII)
320 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
321 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 unsigned int ascii:1;
323 /* The ready flag indicates whether the object layout is initialized
324 completely. This means that this is either a compact object, or
325 the data pointer is filled out. The bit is redundant, and helps
326 to minimize the test in PyUnicode_IS_READY(). */
327 unsigned int ready:1;
328 } state;
329 wchar_t *wstr; /* wchar_t representation (null-terminated) */
330} PyASCIIObject;
331
332/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200333 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 immediately follow the structure. */
335typedef struct {
336 PyASCIIObject _base;
337 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
338 * terminating \0. */
339 char *utf8; /* UTF-8 representation (null-terminated) */
340 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
341 * surrogates count as two code points. */
342} PyCompactUnicodeObject;
343
344/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
345 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200346 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200347typedef struct {
348 PyCompactUnicodeObject _base;
349 union {
350 void *any;
351 Py_UCS1 *latin1;
352 Py_UCS2 *ucs2;
353 Py_UCS4 *ucs4;
354 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000355} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000356#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000357
Mark Hammond91a681d2002-08-12 07:21:58 +0000358PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000359PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000360
Thomas Wouters27d517b2007-02-25 20:39:11 +0000361#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000362 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
363#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000364
365/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000366#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200367
/* Length of the wstr representation of 'op'.
   Compact ASCII strings use the PyASCIIObject structure, which has no
   wstr_length field, so their 'length' field is returned instead; all
   other strings report PyCompactUnicodeObject.wstr_length.
   'op' is parenthesized before each cast so that a pointer expression
   (e.g. "objs + 1") is cast as a whole.  No type checks are performed. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
372
373/* Returns the deprecated Py_UNICODE representation's size in code units
374 (this includes surrogate pairs as 2 units).
375 If the Py_UNICODE representation is not available, it will be computed
376 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
377
Guido van Rossumd8225182000-03-10 22:33:05 +0000378#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200379 (assert(PyUnicode_Check(op)), \
380 (((PyASCIIObject *)(op))->wstr) ? \
381 PyUnicode_WSTR_LENGTH(op) : \
382 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
383 PyUnicode_WSTR_LENGTH(op)))
384
Guido van Rossumd8225182000-03-10 22:33:05 +0000385#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
387
388/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
389 representation on demand. Using this macro is very inefficient now,
390 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
391 use PyUnicode_WRITE() and PyUnicode_READ(). */
392
Guido van Rossumd8225182000-03-10 22:33:05 +0000393#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200394 (assert(PyUnicode_Check(op)), \
395 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
396 PyUnicode_AsUnicode((PyObject *)(op)))
397
Guido van Rossumd8225182000-03-10 22:33:05 +0000398#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200399 ((const char *)(PyUnicode_AS_UNICODE(op)))
400
401
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200402/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200403
404/* Values for PyUnicodeObject.state: */
405
406/* Interning state. */
407#define SSTATE_NOT_INTERNED 0
408#define SSTATE_INTERNED_MORTAL 1
409#define SSTATE_INTERNED_IMMORTAL 2
410
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.
   'op' is parenthesized before the cast so that a pointer expression
   (e.g. "objs + 1") is cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)
416
417/* Return true if the string is compact or 0 if not.
418 No type checks or Ready calls are performed. */
419#define PyUnicode_IS_COMPACT(op) \
420 (((PyASCIIObject*)(op))->state.compact)
421
422/* Return true if the string is a compact ASCII string (use PyASCIIObject
423 structure), or 0 if not. No type checks or Ready calls are performed. */
424#define PyUnicode_IS_COMPACT_ASCII(op) \
425 (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200426
427/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200428 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200429 has not been called yet. */
430#define PyUnicode_WCHAR_KIND 0
431
432/* Return values of the PyUnicode_KIND() macro: */
433
434#define PyUnicode_1BYTE_KIND 1
435#define PyUnicode_2BYTE_KIND 2
436#define PyUnicode_4BYTE_KIND 3
437
438
439/* Return the number of bytes the string uses to represent single characters,
Victor Stinner4584a5b2011-10-01 02:39:37 +0200440 this can be 1, 2 or 4.
441
442 See also PyUnicode_KIND_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443#define PyUnicode_CHARACTER_SIZE(op) \
444 (1 << (PyUnicode_KIND(op) - 1))
445
Georg Brandl4975a9b2011-10-05 16:12:21 +0200446/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447 Py_UCS2, or Py_UCS4 for direct character access.
448 No checks are performed, use PyUnicode_CHARACTER_SIZE or
449 PyUnicode_KIND() before to ensure these will work correctly. */
450
451#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
452#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
453#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
454
Victor Stinner157f83f2011-09-28 21:41:31 +0200455/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200456#define PyUnicode_KIND(op) \
457 (assert(PyUnicode_Check(op)), \
458 assert(PyUnicode_IS_READY(op)), \
459 ((PyASCIIObject *)(op))->state.kind)
460
Victor Stinner157f83f2011-09-28 21:41:31 +0200461/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462#define _PyUnicode_COMPACT_DATA(op) \
463 (PyUnicode_IS_COMPACT_ASCII(op) ? \
464 ((void*)((PyASCIIObject*)(op) + 1)) : \
465 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
466
467#define _PyUnicode_NONCOMPACT_DATA(op) \
468 (assert(((PyUnicodeObject*)(op))->data.any), \
469 ((((PyUnicodeObject *)(op))->data.any)))
470
471#define PyUnicode_DATA(op) \
472 (assert(PyUnicode_Check(op)), \
473 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
474 _PyUnicode_NONCOMPACT_DATA(op))
475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200476/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
Victor Stinner4584a5b2011-10-01 02:39:37 +0200477 The index is a character index, the result is a size in bytes.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200478
Victor Stinner4584a5b2011-10-01 02:39:37 +0200479 See also PyUnicode_CHARACTER_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200480#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
481
482/* In the access macros below, "kind" may be evaluated more than once.
483 All other macro parameters are evaluated exactly once, so it is safe
484 to put side effects into them (such as increasing the index). */
485
486/* Write into the canonical representation, this macro does not do any sanity
487 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200488 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200489 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200490 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200491#define PyUnicode_WRITE(kind, data, index, value) \
492 do { \
493 switch ((kind)) { \
494 case PyUnicode_1BYTE_KIND: { \
495 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
496 break; \
497 } \
498 case PyUnicode_2BYTE_KIND: { \
499 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
500 break; \
501 } \
502 default: { \
503 assert((kind) == PyUnicode_4BYTE_KIND); \
504 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
505 } \
506 } \
507 } while (0)
508
Georg Brandl07de3252011-10-05 16:47:38 +0200509/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200510 or ready calls are performed. */
511#define PyUnicode_READ(kind, data, index) \
512 ((Py_UCS4) \
513 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200514 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200515 ((kind) == PyUnicode_2BYTE_KIND ? \
516 ((const Py_UCS2 *)(data))[(index)] : \
517 ((const Py_UCS4 *)(data))[(index)] \
518 ) \
519 ))
520
521/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
522 calls PyUnicode_KIND() and might call it twice. For single reads, use
523 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
524 cache kind and use PyUnicode_READ instead. */
525#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200526 (assert(PyUnicode_Check(unicode)), \
527 assert(PyUnicode_IS_READY(unicode)), \
528 (Py_UCS4) \
529 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
530 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
531 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
532 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
533 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
534 ) \
535 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200536
537/* Returns the length of the unicode string. The caller has to make sure that
538 the string has its canonical representation set before calling
539 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
540#define PyUnicode_GET_LENGTH(op) \
541 (assert(PyUnicode_Check(op)), \
542 assert(PyUnicode_IS_READY(op)), \
543 ((PyASCIIObject *)(op))->length)
544
545
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   'op' is parenthesized before the cast so that a pointer expression
   (e.g. "objs + 1") is cast as a whole.  No type checks are performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
550
Victor Stinnera3b334d2011-10-03 13:53:37 +0200551/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200553 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200554 Returns 0 on success and -1 on errors. */
555#define PyUnicode_READY(op) \
556 (assert(PyUnicode_Check(op)), \
557 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200558 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560/* Return a maximum character value which is suitable for creating another
561 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200562 than iterating over the string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200563#define PyUnicode_MAX_CHAR_VALUE(op) \
564 (assert(PyUnicode_IS_READY(op)), \
565 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
566 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
567 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
568 (0x7fU) : (0xffU) \
569 ) : \
570 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
571 (0xffffU) : (0x10ffffU) \
572 ))))
573
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000574#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000575
576/* --- Constants ---------------------------------------------------------- */
577
578/* This Unicode character will be used as replacement character during
579 decoding if the errors argument is set to "replace". Note: the
580 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
581 Unicode 3.0. */
582
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200583#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000584
585/* === Public API ========================================================= */
586
587/* --- Plain Py_UNICODE --------------------------------------------------- */
588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200589/* With PEP 393, this is the recommended way to allocate a new unicode object.
590 This function will allocate the object and its buffer in a single memory
591 block. Objects created using this function are not resizable. */
592#ifndef Py_LIMITED_API
593PyAPI_FUNC(PyObject*) PyUnicode_New(
594 Py_ssize_t size, /* Number of code points in the new string */
595 Py_UCS4 maxchar /* maximum code point value in the string */
596 );
597#endif
598
Victor Stinnerd8f65102011-09-29 19:43:17 +0200599/* Initializes the canonical string representation from the deprecated
600 wstr/Py_UNICODE representation. This function is used to convert Unicode
601 objects which were created using the old API to the new flexible format
602 introduced with PEP 393.
603
604 Don't call this function directly, use the public PyUnicode_READY() macro
605 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606#ifndef Py_LIMITED_API
607PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200608 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609 );
610#endif
611
Victor Stinner034f6cf2011-09-30 02:26:44 +0200612/* Get a copy of a Unicode string. */
613PyAPI_FUNC(PyObject*) PyUnicode_Copy(
614 PyObject *unicode
615 );
616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618 character conversion when necessary and falls back to memcpy if possible.
619
Victor Stinnera0702ab2011-09-29 14:14:38 +0200620 Fail if to is too small (smaller than how_many or smaller than
621 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
622 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200623
624 Return the number of written characters, or return -1 and raise an exception
625 on error.
626
627 Pseudo-code:
628
629 how_many = min(how_many, len(from) - from_start)
630 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
631 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200632
633 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200634 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200636PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637 PyObject *to,
638 Py_ssize_t to_start,
639 PyObject *from,
640 Py_ssize_t from_start,
641 Py_ssize_t how_many
642 );
643#endif
644
Guido van Rossumd8225182000-03-10 22:33:05 +0000645/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000647
648 u may be NULL which causes the contents to be undefined. It is the
649 user's responsibility to fill in the needed data afterwards. Note
650 that modifying the Unicode object contents after construction is
651 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000652
653 The buffer is copied into the new object. */
654
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000655#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000656PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000657 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000658 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000659 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000660#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000661
Georg Brandl952867a2010-06-27 10:17:12 +0000662/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000663PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000664 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000665 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000666 );
667
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000668/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000670PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000671 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000672 );
673
Victor Stinnerb9275c12011-10-05 14:01:42 +0200674/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
675 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676#ifndef Py_LIMITED_API
677PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
678 int kind,
679 const void *buffer,
680 Py_ssize_t size);
681#endif
682
683PyAPI_FUNC(PyObject*) PyUnicode_Substring(
684 PyObject *str,
685 Py_ssize_t start,
686 Py_ssize_t end);
687
688/* Copy the string into a UCS4 buffer including the null character if copy_null
689 is set. Return NULL and raise an exception on error. Raise a ValueError if
690 the buffer is smaller than the string. Return buffer on success.
691
692 buflen is the length of the buffer in (Py_UCS4) characters. */
693PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
694 PyObject *unicode,
695 Py_UCS4* buffer,
696 Py_ssize_t buflen,
697 int copy_null);
698
699/* Copy the string into a UCS4 buffer. A new buffer is allocated using
700 * PyMem_Malloc; if this fails, NULL is returned with a memory error
701 exception set. */
702PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
703
Guido van Rossumd8225182000-03-10 22:33:05 +0000704/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705 Py_UNICODE buffer.
706 If the wchar_t/Py_UNICODE representation is not yet available, this
707 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000708
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000709#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000710PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000712 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000713#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715/* Return a read-only pointer to the Unicode object's internal
716 Py_UNICODE buffer and save the length at size.
717 If the wchar_t/Py_UNICODE representation is not yet available, this
718 function will calculate it. */
719
720#ifndef Py_LIMITED_API
721PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
722 PyObject *unicode, /* Unicode object */
723 Py_ssize_t *size /* location where to save the length */
724 );
725#endif
726
Guido van Rossumd8225182000-03-10 22:33:05 +0000727/* Get the length of the Unicode object. */
728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
730 PyObject *unicode
731);
732
Victor Stinner157f83f2011-09-28 21:41:31 +0200733/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734 string representation. */
735
Martin v. Löwis18e16552006-02-15 17:27:45 +0000736PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000738 );
739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740/* Read a character from the string. */
741
742PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
743 PyObject *unicode,
744 Py_ssize_t index
745 );
746
747/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200748 PyUnicode_New, must not be shared, and must not have been hashed yet.
749
750 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751
752PyAPI_FUNC(int) PyUnicode_WriteChar(
753 PyObject *unicode,
754 Py_ssize_t index,
755 Py_UCS4 character
756 );
757
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000758#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000759/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000760PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000761#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000762
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200763/* Resize a Unicode object allocated by the legacy API (e.g.
764 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
765 PyUnicode_New) cannot be resized by this function.
766
767 The length is a number of Py_UNICODE characters (and not the number of code
768 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000769
770 *unicode is modified to point to the new (resized) object and 0
771 returned on success.
772
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200773 If the refcount on the object is 1, the function resizes the string in
774 place, which is usually faster than allocating a new string (and copying
775 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000776
777 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200778 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000779
Mark Hammond91a681d2002-08-12 07:21:58 +0000780PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 PyObject **unicode, /* Pointer to the Unicode object */
782 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000783 );
784
Guido van Rossumd8225182000-03-10 22:33:05 +0000785/* Coerce obj to a Unicode object and return a reference with
786 *incremented* refcount.
787
788 Coercion is done in the following way:
789
Georg Brandl952867a2010-06-27 10:17:12 +0000790 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000791 under the assumptions that they contain data using the UTF-8
792 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000793
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000794 2. All other objects (including Unicode objects) raise an
795 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000796
797 The API returns NULL in case of an error. The caller is responsible
798 for decref'ing the returned objects.
799
800*/
801
Mark Hammond91a681d2002-08-12 07:21:58 +0000802PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000804 const char *encoding, /* encoding */
805 const char *errors /* error handling */
806 );
807
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000808/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000809 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000811 Unicode objects are passed back as-is (subclasses are converted to
812 true Unicode objects), all other objects are delegated to
813 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000814 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000815
816 The API returns NULL in case of an error. The caller is responsible
817 for decref'ing the returned objects.
818
819*/
820
Mark Hammond91a681d2002-08-12 07:21:58 +0000821PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000823 );
824
Victor Stinner1205f272010-09-11 00:54:47 +0000825PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
826 const char *format, /* ASCII-encoded string */
827 va_list vargs
828 );
829PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
830 const char *format, /* ASCII-encoded string */
831 ...
832 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000833
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000834#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000835/* Format the object based on the format_spec, as defined in PEP 3101
836 (Advanced String Formatting). */
837PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838 PyObject *format_spec,
839 Py_ssize_t start,
840 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000841#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000842
Walter Dörwald16807132007-05-25 13:52:07 +0000843PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
844PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000845PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
846 const char *u /* UTF-8 encoded string */
847 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000848#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000849PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000850#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000851
852/* Read the `interned` field of the object's state.
   The argument is cast to PyASCIIObject * without any type check, so
   use this only if you know op is a string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853#define PyUnicode_CHECK_INTERNED(op) \
854 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000855
Guido van Rossumd8225182000-03-10 22:33:05 +0000856/* --- wchar_t support for platforms which support it --------------------- */
857
858#ifdef HAVE_WCHAR_H
859
Georg Brandl952867a2010-06-27 10:17:12 +0000860/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000861 size.
862
863 The buffer is copied into the new object. */
864
Mark Hammond91a681d2002-08-12 07:21:58 +0000865PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000866 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000867 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000868 );
869
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000870/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000871 most size wchar_t characters are copied.
872
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000873 Note that the resulting wchar_t string may or may not be
874 0-terminated. It is the responsibility of the caller to make sure
875 that the wchar_t string is 0-terminated in case this is required by
876 the application.
877
878 Returns the number of wchar_t characters copied (excluding a
879 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000880 error. */
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000883 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000884 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000885 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000886 );
887
Victor Stinner137c34c2010-09-29 10:25:54 +0000888/* Convert the Unicode object to a wide character string. The output string
889 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200890 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000891
892 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
893 on success. On error, returns NULL, *size is undefined and raises a
894 MemoryError. */
895
896PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000897 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000898 Py_ssize_t *size /* number of characters of the result */
899 );
900
Victor Stinner9f789e72011-10-01 03:57:28 +0200901#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200903#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904
Guido van Rossumd8225182000-03-10 22:33:05 +0000905#endif
906
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000907/* --- Unicode ordinals --------------------------------------------------- */
908
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000909/* Create a Unicode Object from the given Unicode code point ordinal.
910
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000911 The ordinal must be in range(0x110000). A ValueError is raised in
912 case it is not. (The former narrow/wide build distinction no longer
913 applies with the PEP 393 string representation.)
914
915*/
916
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000917PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000919/* --- Free-list management ----------------------------------------------- */
920
921/* Clear the free list used by the Unicode implementation.
922
923 This can be used to release memory used for objects on the free
924 list back to the Python memory allocator.
925
926*/
927
928PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
929
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000930/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000931
932 Many of these APIs take two arguments encoding and errors. These
933 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000934 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000935
Georg Brandl952867a2010-06-27 10:17:12 +0000936 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000937
938 Error handling is set by errors which may also be set to NULL
939 meaning to use the default handling defined for the codec. Default
940 error handling for all builtin codecs is "strict" (ValueErrors are
941 raised).
942
943 The codecs all use a similar interface. Only deviations from the
944 generic ones are documented.
945
946*/
947
Fred Drakecb093fe2000-05-09 19:51:53 +0000948/* --- Manage the default encoding ---------------------------------------- */
949
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000950/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000951 Unicode object unicode and the size of the encoded representation
952 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000953
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000954 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000955
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200956 This function caches the UTF-8 encoded string in the unicodeobject
957 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 when the unicodeobject is deallocated.
959
960 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
961 support the previous internal function with the same behaviour.
962
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000963 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000964 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000965
966 *** If you need to access the Unicode object as UTF-8 bytes string,
967 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000968*/
969
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000970#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000972 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000973 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000975#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000976
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000977/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000978 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
981 in the unicodeobject.
982
983 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
984 support the previous internal function with the same behaviour.
985
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000986 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000987 extracted from the returned data.
988
989 *** This API is for interpreter INTERNAL USE ONLY and will likely
990 *** be removed or changed in the future.
991
992 *** If you need to access the Unicode object as UTF-8 bytes string,
993 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000994
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000995*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000996
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000997#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
999#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001000#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001001
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001002/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001003
Mark Hammond91a681d2002-08-12 07:21:58 +00001004PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001005
Guido van Rossumd8225182000-03-10 22:33:05 +00001006/* --- Generic Codecs ----------------------------------------------------- */
1007
1008/* Create a Unicode object by decoding the encoded string s of the
1009 given size. */
1010
Mark Hammond91a681d2002-08-12 07:21:58 +00001011PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001012 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001013 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001014 const char *encoding, /* encoding */
1015 const char *errors /* error handling */
1016 );
1017
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001018/* Decode a Unicode object unicode and return the result as Python
1019 object. */
1020
1021PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001022 PyObject *unicode, /* Unicode object */
1023 const char *encoding, /* encoding */
1024 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001025 );
1026
1027/* Decode a Unicode object unicode and return the result as Unicode
1028 object. */
1029
1030PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001031 PyObject *unicode, /* Unicode object */
1032 const char *encoding, /* encoding */
1033 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001034 );
1035
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001036/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001037 Python string object. */
1038
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001039#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001040PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001041 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001042 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001043 const char *encoding, /* encoding */
1044 const char *errors /* error handling */
1045 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001046#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001047
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001048/* Encodes a Unicode object and returns the result as Python
1049 object. */
1050
1051PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001052 PyObject *unicode, /* Unicode object */
1053 const char *encoding, /* encoding */
1054 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001055 );
1056
Guido van Rossumd8225182000-03-10 22:33:05 +00001057/* Encodes a Unicode object and returns the result as Python string
1058 object. */
1059
Mark Hammond91a681d2002-08-12 07:21:58 +00001060PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 PyObject *unicode, /* Unicode object */
1062 const char *encoding, /* encoding */
1063 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001064 );
1065
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001066/* Encodes a Unicode object and returns the result as Unicode
1067 object. */
1068
1069PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001070 PyObject *unicode, /* Unicode object */
1071 const char *encoding, /* encoding */
1072 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001073 );
1074
1075/* Build an encoding map. */
1076
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001077PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1078 PyObject* string /* 256 character map */
1079 );
1080
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081/* --- UTF-7 Codecs ------------------------------------------------------- */
1082
Mark Hammond91a681d2002-08-12 07:21:58 +00001083PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001084 const char *string, /* UTF-7 encoded string */
1085 Py_ssize_t length, /* size of string */
1086 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 );
1088
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001089PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 const char *string, /* UTF-7 encoded string */
1091 Py_ssize_t length, /* size of string */
1092 const char *errors, /* error handling */
1093 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001094 );
1095
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001096#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001097PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 const Py_UNICODE *data, /* Unicode char buffer */
1099 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1100 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1101 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1102 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001103 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001104#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001105
Guido van Rossumd8225182000-03-10 22:33:05 +00001106/* --- UTF-8 Codecs ------------------------------------------------------- */
1107
Mark Hammond91a681d2002-08-12 07:21:58 +00001108PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 const char *string, /* UTF-8 encoded string */
1110 Py_ssize_t length, /* size of string */
1111 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001112 );
1113
Walter Dörwald69652032004-09-07 20:24:22 +00001114PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 const char *string, /* UTF-8 encoded string */
1116 Py_ssize_t length, /* size of string */
1117 const char *errors, /* error handling */
1118 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001119 );
1120
Mark Hammond91a681d2002-08-12 07:21:58 +00001121PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001123 );
1124
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001125#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1127 PyObject *unicode,
1128 const char *errors);
1129
Mark Hammond91a681d2002-08-12 07:21:58 +00001130PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 const Py_UNICODE *data, /* Unicode char buffer */
1132 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1133 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001134 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001135#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001136
Walter Dörwald41980ca2007-08-16 21:55:45 +00001137/* --- UTF-32 Codecs ------------------------------------------------------ */
1138
1139/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1140 the corresponding Unicode object.
1141
1142 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001144
1145 If byteorder is non-NULL, the decoder starts decoding using the
1146 given byte order:
1147
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 *byteorder == -1: little endian
1149 *byteorder == 0: native order
1150 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001151
1152 In native mode, the first four bytes of the stream are checked for a
1153 BOM mark. If found, the BOM mark is analysed, the byte order
1154 adjusted and the BOM skipped. In the other modes, no BOM mark
1155 interpretation is done. After completion, *byteorder is set to the
1156 current byte order at the end of input data.
1157
1158 If byteorder is NULL, the codec starts in native order mode.
1159
1160*/
1161
1162PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001163 const char *string, /* UTF-32 encoded string */
1164 Py_ssize_t length, /* size of string */
1165 const char *errors, /* error handling */
1166 int *byteorder /* pointer to byteorder to use
1167 0=native;-1=LE,1=BE; updated on
1168 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001169 );
1170
1171PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 const char *string, /* UTF-32 encoded string */
1173 Py_ssize_t length, /* size of string */
1174 const char *errors, /* error handling */
1175 int *byteorder, /* pointer to byteorder to use
1176 0=native;-1=LE,1=BE; updated on
1177 exit */
1178 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001179 );
1180
1181/* Returns a Python string using the UTF-32 encoding in native byte
1182 order. The string always starts with a BOM mark. */
1183
1184PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001186 );
1187
1188/* Returns a Python string object holding the UTF-32 encoded value of
1189 the Unicode data.
1190
1191 If byteorder is not 0, output is written according to the following
1192 byte order:
1193
1194 byteorder == -1: little endian
1195 byteorder == 0: native byte order (writes a BOM mark)
1196 byteorder == 1: big endian
1197
1198 If byteorder is 0, the output string will always start with the
1199 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1200 prepended.
1201
1202*/
1203
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001204#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001205PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 const Py_UNICODE *data, /* Unicode char buffer */
1207 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1208 const char *errors, /* error handling */
1209 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001210 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001211#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001212
Guido van Rossumd8225182000-03-10 22:33:05 +00001213/* --- UTF-16 Codecs ------------------------------------------------------ */
1214
Guido van Rossum9e896b32000-04-05 20:11:21 +00001215/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001216 the corresponding Unicode object.
1217
1218 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001220
1221 If byteorder is non-NULL, the decoder starts decoding using the
1222 given byte order:
1223
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 *byteorder == -1: little endian
1225 *byteorder == 0: native order
1226 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001227
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001228 In native mode, the first two bytes of the stream are checked for a
1229 BOM mark. If found, the BOM mark is analysed, the byte order
1230 adjusted and the BOM skipped. In the other modes, no BOM mark
1231 interpretation is done. After completion, *byteorder is set to the
1232 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001233
1234 If byteorder is NULL, the codec starts in native order mode.
1235
1236*/
1237
Mark Hammond91a681d2002-08-12 07:21:58 +00001238PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 const char *string, /* UTF-16 encoded string */
1240 Py_ssize_t length, /* size of string */
1241 const char *errors, /* error handling */
1242 int *byteorder /* pointer to byteorder to use
1243 0=native;-1=LE,1=BE; updated on
1244 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001245 );
1246
/* Stateful variant of PyUnicode_DecodeUTF16().

   Takes the same arguments as PyUnicode_DecodeUTF16() plus a consumed
   pointer through which the number of bytes consumed is reported.

   NOTE(review): presumably a non-NULL consumed lets the decoder stop
   before an incomplete trailing sequence instead of treating it as an
   error -- confirm against the implementation. */
Walter Dörwald69652032004-09-07 20:24:22 +00001247PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 const char *string, /* UTF-16 encoded string */
1249 Py_ssize_t length, /* size of string */
1250 const char *errors, /* error handling */
1251 int *byteorder, /* pointer to byteorder to use
1252 0=native;-1=LE,1=BE; updated on
1253 exit */
1254 Py_ssize_t *consumed /* out: number of bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001255 );
1256
Guido van Rossumd8225182000-03-10 22:33:05 +00001257/* Returns a Python string using the UTF-16 encoding in native byte
1258 order. The string always starts with a BOM mark. */
1259
Mark Hammond91a681d2002-08-12 07:21:58 +00001260PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001261 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001262 );
1263
1264/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001265 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001266
1267 The byteorder argument selects the byte order of the output as
1268 follows:
1269
1270 byteorder == -1: little endian
1271 byteorder == 0: native byte order (writes a BOM mark)
1272 byteorder == 1: big endian
1273
1274 If byteorder is 0, the output string will always start with the
1275 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1276 prepended.
1277
1278 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1279 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001280 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001281
1282*/
1283
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001284#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001285PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 const Py_UNICODE *data, /* Unicode char buffer */
1287 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1288 const char *errors, /* error handling */
1289 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001290 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001291#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001292
1293/* --- Unicode-Escape Codecs ---------------------------------------------- */
1294
Mark Hammond91a681d2002-08-12 07:21:58 +00001295PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 const char *string, /* Unicode-Escape encoded string */
1297 Py_ssize_t length, /* size of string */
1298 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001299 );
1300
Mark Hammond91a681d2002-08-12 07:21:58 +00001301PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001303 );
1304
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001306PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001307 const Py_UNICODE *data, /* Unicode char buffer */
1308 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001310#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001311
1312/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1313
Mark Hammond91a681d2002-08-12 07:21:58 +00001314PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 const char *string, /* Raw-Unicode-Escape encoded string */
1316 Py_ssize_t length, /* size of string */
1317 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001318 );
1319
Mark Hammond91a681d2002-08-12 07:21:58 +00001320PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001322 );
1323
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001324#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001325PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326 const Py_UNICODE *data, /* Unicode char buffer */
1327 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001328 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001329#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001330
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001331/* --- Unicode Internal Codec ---------------------------------------------
1332
1333 Only for internal use in _codecsmodule.c */
1334
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001335#ifndef Py_LIMITED_API
/* Decoder entry point for the "Unicode Internal" codec.

   NOTE(review): the parameter roles presumably mirror the other
   PyUnicode_Decode*() declarations in this header (encoded buffer, its
   size, error handling name) -- confirm against the implementation.
   NOTE(review): unlike its neighbours, this prototype is not wrapped in
   PyAPI_FUNC(); confirm that this is intentional. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001336PyObject *_PyUnicode_DecodeUnicodeInternal(
1337 const char *string, /* encoded string (presumably; see note above) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001338 Py_ssize_t length, /* size of string (presumably; see note above) */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001339 const char *errors /* error handling (presumably; see note above) */
1340 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001341#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001342
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001344
1345 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1346
1347*/
1348
Mark Hammond91a681d2002-08-12 07:21:58 +00001349PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 const char *string, /* Latin-1 encoded string */
1351 Py_ssize_t length, /* size of string */
1352 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001353 );
1354
Mark Hammond91a681d2002-08-12 07:21:58 +00001355PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001357 );
1358
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001359#ifndef Py_LIMITED_API
/* Underscore-prefixed variant of PyUnicode_AsLatin1String() that takes
   an additional errors argument; only declared when Py_LIMITED_API is
   not defined.
   NOTE(review): errors presumably names the error handler, as for the
   other codec functions in this header -- confirm. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1361 PyObject* unicode, /* Unicode object */
1362 const char* errors); /* error handling */
1363
Mark Hammond91a681d2002-08-12 07:21:58 +00001364PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 const Py_UNICODE *data, /* Unicode char buffer */
1366 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1367 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001368 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001369#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001370
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001372
1373 Only 7-bit ASCII data is accepted. All other codes generate errors.
1374
1375*/
1376
Mark Hammond91a681d2002-08-12 07:21:58 +00001377PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 const char *string, /* ASCII encoded string */
1379 Py_ssize_t length, /* size of string */
1380 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001381 );
1382
Mark Hammond91a681d2002-08-12 07:21:58 +00001383PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001385 );
1386
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001387#ifndef Py_LIMITED_API
/* Underscore-prefixed variant of PyUnicode_AsASCIIString() that takes
   an additional errors argument; only declared when Py_LIMITED_API is
   not defined.
   NOTE(review): errors presumably names the error handler, as for the
   other codec functions in this header -- confirm. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1389 PyObject* unicode, /* Unicode object */
1390 const char* errors); /* error handling */
1391
Mark Hammond91a681d2002-08-12 07:21:58 +00001392PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 const Py_UNICODE *data, /* Unicode char buffer */
1394 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1395 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001396 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001397#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001398
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001399/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001402
1403 Decoding mappings must map single string characters to single
1404 Unicode characters, integers (which are then interpreted as Unicode
1405 ordinals) or None (meaning "undefined mapping" and causing an
1406 error).
1407
1408 Encoding mappings must map single Unicode characters to single
1409 string characters, integers (which are then interpreted as Latin-1
1410 ordinals) or None (meaning "undefined mapping" and causing an
1411 error).
1412
1413 If a character lookup fails with a LookupError, the character is
1414 copied as-is, meaning that its ordinal value will be interpreted as
1415 a Unicode or Latin-1 ordinal, respectively. Because of this, mappings
1416 only need to contain those entries which map characters to different
1417 code points.
1418
1419*/
1420
Mark Hammond91a681d2002-08-12 07:21:58 +00001421PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 const char *string, /* Encoded string */
1423 Py_ssize_t length, /* size of string */
1424 PyObject *mapping, /* character mapping
1425 (char ordinal -> unicode ordinal) */
1426 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001427 );
1428
Mark Hammond91a681d2002-08-12 07:21:58 +00001429PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 PyObject *unicode, /* Unicode object */
1431 PyObject *mapping /* character mapping
1432 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001433 );
1434
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001435#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001436PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 const Py_UNICODE *data, /* Unicode char buffer */
1438 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1439 PyObject *mapping, /* character mapping
1440 (unicode ordinal -> char ordinal) */
1441 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001442 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001443#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001444
1445/* Translate a Py_UNICODE buffer of the given length by applying a
1446 character mapping table to it and return the resulting Unicode
1447 object.
1448
1449 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001451
1452 Mapping tables may be dictionaries or sequences. Unmapped character
1453 ordinals (ones which cause a LookupError) are left untouched and
1454 are copied as-is.
1455
1456*/
1457
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001458#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001459PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 const Py_UNICODE *data, /* Unicode char buffer */
1461 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1462 PyObject *table, /* Translate table */
1463 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001464 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001465#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001466
Victor Stinner99b95382011-07-04 14:23:54 +02001467#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001468
Guido van Rossumefec1152000-03-28 02:01:15 +00001469/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001470
Mark Hammond91a681d2002-08-12 07:21:58 +00001471PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001472 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001473 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001474 const char *errors /* error handling */
1475 );
1476
/* Stateful variant of PyUnicode_DecodeMBCS().

   Takes the same arguments as PyUnicode_DecodeMBCS() plus a consumed
   pointer through which the number of bytes consumed is reported.

   NOTE(review): presumably a non-NULL consumed lets the decoder stop
   before an incomplete trailing multi-byte sequence -- confirm against
   the implementation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001477PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1478 const char *string, /* MBCS encoded string */
1479 Py_ssize_t length, /* size of string */
1480 const char *errors, /* error handling */
1481 Py_ssize_t *consumed /* out: number of bytes consumed */
1482 );
1483
Mark Hammond91a681d2002-08-12 07:21:58 +00001484PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001485 PyObject *unicode /* Unicode object */
1486 );
1487
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001488#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001489PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001490 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001491 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001492 const char *errors /* error handling */
1493 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001494#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001495
Victor Stinner99b95382011-07-04 14:23:54 +02001496#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001497
Guido van Rossum9e896b32000-04-05 20:11:21 +00001498/* --- Decimal Encoder ---------------------------------------------------- */
1499
1500/* Takes a Unicode string holding a decimal value and writes it into
1501 an output buffer using standard ASCII digit codes.
1502
1503 The output buffer has to provide at least length+1 bytes of storage
1504 area. The output string is 0-terminated.
1505
1506 The encoder converts whitespace to ' ', decimal characters to their
1507 corresponding ASCII digit and all other Latin-1 characters except
1508 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1509 are treated as errors. This includes embedded NUL characters.
1510
1511 Error handling is defined by the errors argument:
1512
1513 NULL or "strict": raise a ValueError
1514 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001515 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001516 "replace": replaces illegal characters with '?'
1517
1518 Returns 0 on success, -1 on failure.
1519
1520*/
1521
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001522#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001523PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001524 Py_UNICODE *s, /* Unicode buffer */
1525 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1526 char *output, /* Output buffer; must have size >= length+1
                     (room for the terminating 0) */
1527 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001528 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001529#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001530
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001531/* Transforms code points that have decimal digit property to the
1532 corresponding ASCII digit code points.
1533
1534 Returns a new Unicode string on success, NULL on failure.
1535*/
1536
Georg Brandlb5503082010-12-05 11:40:48 +00001537#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001538PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1539 Py_UNICODE *s, /* Unicode buffer */
1540 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1541 );
Georg Brandlb5503082010-12-05 11:40:48 +00001542#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1545 as argument instead of a raw buffer and length. This function additionally
1546 transforms spaces to ASCII because this is what the callers in longobject,
1547 floatobject, and complexobject did anyways. */
1548
1549#ifndef Py_LIMITED_API
1550PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1551 PyObject *unicode /* Unicode object */
1552 );
1553#endif
1554
Martin v. Löwis011e8422009-05-05 04:43:17 +00001555/* --- File system encoding ---------------------------------------------- */
1556
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001557/* ParseTuple converter: encode str objects to bytes using
1558 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001559
1560PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1561
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001562/* ParseTuple converter: decode bytes objects to unicode using
1563 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1564
1565PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1566
Victor Stinner77c38622010-05-14 15:58:55 +00001567/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1568 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001569
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001570 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1571 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001572
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001573 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001574*/
1575
1576PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1577 const char *s /* encoded string */
1578 );
1579
Victor Stinner77c38622010-05-14 15:58:55 +00001580/* Decode a string using Py_FileSystemDefaultEncoding
1581 and the "surrogateescape" error handler.
1582
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001583 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1584 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001585*/
1586
Martin v. Löwis011e8422009-05-05 04:43:17 +00001587PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1588 const char *s, /* encoded string */
1589 Py_ssize_t size /* size */
1590 );
1591
Victor Stinnerae6265f2010-05-15 16:27:27 +00001592/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001593 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001594
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001595 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1596 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001597*/
1598
1599PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1600 PyObject *unicode
1601 );
1602
Guido van Rossumd8225182000-03-10 22:33:05 +00001603/* --- Methods & Slots ----------------------------------------------------
1604
1605 These are capable of handling Unicode objects and strings on input
1606 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001607 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001608
1609/* Concat two strings giving a new Unicode string. */
1610
Mark Hammond91a681d2002-08-12 07:21:58 +00001611PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 PyObject *left, /* Left string */
1613 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001614 );
1615
Walter Dörwald1ab83302007-05-18 17:15:44 +00001616/* Concat two strings and put the result in *pleft
1617 (sets *pleft to NULL on error) */
1618
1619PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 PyObject **pleft, /* Pointer to left string */
1621 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001622 );
1623
1624/* Concat two strings, put the result in *pleft and drop the right object
1625 (sets *pleft to NULL on error) */
1626
1627PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 PyObject **pleft, /* Pointer to left string */
1629 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001630 );
1631
Guido van Rossumd8225182000-03-10 22:33:05 +00001632/* Split a string giving a list of Unicode strings.
1633
1634 If sep is NULL, splitting will be done at all whitespace
1635 substrings. Otherwise, splits occur at the given separator.
1636
1637 At most maxsplit splits will be done. If negative, no limit is set.
1638
1639 Separators are not included in the resulting list.
1640
1641*/
1642
Mark Hammond91a681d2002-08-12 07:21:58 +00001643PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 PyObject *s, /* String to split */
1645 PyObject *sep, /* String separator */
1646 Py_ssize_t maxsplit /* Maxsplit count */
1647 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001648
1649/* Ditto, but split at line breaks.
1650
1651 CRLF is considered to be one line break. Line breaks are not
1652 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653
Mark Hammond91a681d2002-08-12 07:21:58 +00001654PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 PyObject *s, /* String to split */
1656 int keepends /* If true, line end markers are included */
1657 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001658
Thomas Wouters477c8d52006-05-27 19:21:47 +00001659/* Partition a string using a given separator. */
1660
1661PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 PyObject *s, /* String to partition */
1663 PyObject *sep /* String separator */
1664 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001665
1666/* Partition a string using a given separator, searching from the end of the
1667 string. */
1668
1669PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 PyObject *s, /* String to partition */
1671 PyObject *sep /* String separator */
1672 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001673
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001674/* Split a string giving a list of Unicode strings.
1675
1676 If sep is NULL, splitting will be done at all whitespace
1677 substrings. Otherwise, splits occur at the given separator.
1678
1679 At most maxsplit splits will be done. But unlike PyUnicode_Split
1680 PyUnicode_RSplit splits from the end of the string. If negative,
1681 no limit is set.
1682
1683 Separators are not included in the resulting list.
1684
1685*/
1686
1687PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 PyObject *s, /* String to split */
1689 PyObject *sep, /* String separator */
1690 Py_ssize_t maxsplit /* Maxsplit count */
1691 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001692
Guido van Rossumd8225182000-03-10 22:33:05 +00001693/* Translate a string by applying a character mapping table to it and
1694 return the resulting Unicode object.
1695
1696 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001698
1699 Mapping tables may be dictionaries or sequences. Unmapped character
1700 ordinals (ones which cause a LookupError) are left untouched and
1701 are copied as-is.
1702
1703*/
1704
Mark Hammond91a681d2002-08-12 07:21:58 +00001705PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 PyObject *str, /* String */
1707 PyObject *table, /* Translate table */
1708 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001709 );
1710
1711/* Join a sequence of strings using the given separator and return
1712 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001713
Mark Hammond91a681d2002-08-12 07:21:58 +00001714PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 PyObject *separator, /* Separator string */
1716 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001717 );
1718
1719/* Return 1 if substr matches str[start:end] at the given tail end, 0
1720 otherwise. */
1721
Martin v. Löwis18e16552006-02-15 17:27:45 +00001722PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001723 PyObject *str, /* String */
1724 PyObject *substr, /* Prefix or Suffix string */
1725 Py_ssize_t start, /* Start index */
1726 Py_ssize_t end, /* Stop index */
1727 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001728 );
1729
1730/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001731 given search direction or -1 if not found. -2 is returned in case
1732 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001733
Martin v. Löwis18e16552006-02-15 17:27:45 +00001734PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 PyObject *str, /* String */
1736 PyObject *substr, /* Substring to find */
1737 Py_ssize_t start, /* Start index */
1738 Py_ssize_t end, /* Stop index */
1739 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001740 );
1741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742/* Like PyUnicode_Find(), but searches for a single character only. */
1743PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1744 PyObject *str, /* String */
1745 Py_UCS4 ch, /* Character to find */
1746 Py_ssize_t start, /* Start index */
1747 Py_ssize_t end, /* Stop index */
1748 int direction /* Find direction, as for PyUnicode_Find() */
1749 );
1750
Barry Warsaw51ac5802000-03-20 16:36:48 +00001751/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001752
Martin v. Löwis18e16552006-02-15 17:27:45 +00001753PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001754 PyObject *str, /* String */
1755 PyObject *substr, /* Substring to count */
1756 Py_ssize_t start, /* Start index */
1757 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001758 );
1759
Barry Warsaw51ac5802000-03-20 16:36:48 +00001760/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001761 and return the resulting Unicode object. */
1762
Mark Hammond91a681d2002-08-12 07:21:58 +00001763PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001764 PyObject *str, /* String */
1765 PyObject *substr, /* Substring to find */
1766 PyObject *replstr, /* Substring to replace */
1767 Py_ssize_t maxcount /* Max. number of replacements to apply;
1768 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001769 );
1770
1771/* Compare two strings and return -1, 0, 1 for less than, equal,
1772 greater than resp. */
1773
Mark Hammond91a681d2002-08-12 07:21:58 +00001774PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001775 PyObject *left, /* Left string */
1776 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001777 );
1778
/* Compare a string object with a const char * buffer holding
   ASCII-encoded text.
   NOTE(review): the result presumably follows PyUnicode_Compare()
   (-1, 0, 1 for less than, equal, greater than) -- confirm against the
   implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001779PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1780 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001781 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001782 );
1783
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001784/* Rich compare two strings and return one of the following:
1785
1786 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001787 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001788 - Py_NotImplemented in case the type combination is unknown
1789
1790 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1791 case the conversion of the arguments to Unicode fails with a
1792 UnicodeDecodeError.
1793
1794 Possible values for op:
1795
1796 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1797
1798*/
1799
1800PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001801 PyObject *left, /* Left string */
1802 PyObject *right, /* Right string */
1803 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001804 );
1805
Thomas Wouters7e474022000-07-16 12:04:32 +00001806/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001807 the resulting Unicode string. */
1808
Mark Hammond91a681d2002-08-12 07:21:58 +00001809PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001810 PyObject *format, /* Format string */
1811 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001812 );
1813
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001814/* Checks whether element is contained in container and returns 1/0
1815 accordingly.
1816
1817 element has to coerce to a one-element Unicode string. -1 is
1818 returned in case of an error. */
1819
Mark Hammond91a681d2002-08-12 07:21:58 +00001820PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001821 PyObject *container, /* Container string */
1822 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001823 );
1824
Martin v. Löwis47383402007-08-15 07:32:56 +00001825/* Checks whether argument is a valid identifier. */
1826
1827PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1828
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001829#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001830/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001831PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001832 PyUnicodeObject *self,
1833 int striptype,
1834 PyObject *sepobj
1835 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001836#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001837
Eric Smith5807c412008-05-11 21:00:57 +00001838/* Using the current locale, insert the thousands grouping
1839 into the string pointed to by buffer. For the argument descriptions,
1840 see Objects/stringlib/localeutil.h */
1841
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001842#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001843PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1844 Py_ssize_t n_buffer,
1845 Py_UNICODE *digits,
1846 Py_ssize_t n_digits,
1847 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001848#endif
Eric Smith5807c412008-05-11 21:00:57 +00001849
Eric Smitha3b1ac82009-04-03 14:45:06 +00001850/* Using explicit passed-in values, insert the thousands grouping
1851 into the string pointed to by buffer. For the argument descriptions,
1852 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001853#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001855 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 int kind,
1857 void *buffer,
1858 Py_ssize_t n_buffer,
1859 void *digits,
1860 Py_ssize_t n_digits,
1861 Py_ssize_t min_width,
1862 const char *grouping,
1863 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001864#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001865/* === Characters Type APIs =============================================== */
1866
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001867/* Helper array used by Py_UNICODE_ISSPACE(). */
1868
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001869#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001870PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1871
Guido van Rossumd8225182000-03-10 22:33:05 +00001872/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001873 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001874
1875 These APIs are implemented in Objects/unicodectype.c.
1876
1877*/
1878
Mark Hammond91a681d2002-08-12 07:21:58 +00001879PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001880 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001881 );
1882
Mark Hammond91a681d2002-08-12 07:21:58 +00001883PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001884 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001885 );
1886
Mark Hammond91a681d2002-08-12 07:21:58 +00001887PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001888 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001889 );
1890
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001891PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001892 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001893 );
1894
1895PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001896 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001897 );
1898
Mark Hammond91a681d2002-08-12 07:21:58 +00001899PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001900 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001901 );
1902
Mark Hammond91a681d2002-08-12 07:21:58 +00001903PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001904 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001905 );
1906
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001907PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1908 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001909 );
1910
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001911PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1912 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001913 );
1914
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001915PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1916 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001917 );
1918
Mark Hammond91a681d2002-08-12 07:21:58 +00001919PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001920 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001921 );
1922
Mark Hammond91a681d2002-08-12 07:21:58 +00001923PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001924 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001925 );
1926
Mark Hammond91a681d2002-08-12 07:21:58 +00001927PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001928 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001929 );
1930
Mark Hammond91a681d2002-08-12 07:21:58 +00001931PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001932 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001933 );
1934
Mark Hammond91a681d2002-08-12 07:21:58 +00001935PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001936 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001937 );
1938
Mark Hammond91a681d2002-08-12 07:21:58 +00001939PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001940 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001941 );
1942
Georg Brandl559e5d72008-06-11 18:37:52 +00001943PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001944 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001945 );
1946
Mark Hammond91a681d2002-08-12 07:21:58 +00001947PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001948 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001949 );
1950
Victor Stinneref8d95c2010-08-16 22:03:11 +00001951PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1952 const Py_UNICODE *u
1953 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001954
1955PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001956 Py_UNICODE *s1,
1957 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001958
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001959PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1960 Py_UNICODE *s1, const Py_UNICODE *s2);
1961
Martin v. Löwis5b222132007-06-10 09:51:05 +00001962PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001963 Py_UNICODE *s1,
1964 const Py_UNICODE *s2,
1965 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001966
1967PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001968 const Py_UNICODE *s1,
1969 const Py_UNICODE *s2
1970 );
1971
1972PyAPI_FUNC(int) Py_UNICODE_strncmp(
1973 const Py_UNICODE *s1,
1974 const Py_UNICODE *s2,
1975 size_t n
1976 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977
1978PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001979 const Py_UNICODE *s,
1980 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001981 );
1982
Victor Stinner331ea922010-08-10 16:37:20 +00001983PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001984 const Py_UNICODE *s,
1985 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001986 );
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988PyAPI_FUNC(size_t) Py_UCS4_strlen(
1989 const Py_UCS4 *u
1990 );
1991
1992PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1993 Py_UCS4 *s1,
1994 const Py_UCS4 *s2);
1995
1996PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1997 Py_UCS4 *s1, const Py_UCS4 *s2);
1998
1999PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
2000 Py_UCS4 *s1,
2001 const Py_UCS4 *s2,
2002 size_t n);
2003
2004PyAPI_FUNC(int) Py_UCS4_strcmp(
2005 const Py_UCS4 *s1,
2006 const Py_UCS4 *s2
2007 );
2008
2009PyAPI_FUNC(int) Py_UCS4_strncmp(
2010 const Py_UCS4 *s1,
2011 const Py_UCS4 *s2,
2012 size_t n
2013 );
2014
2015PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
2016 const Py_UCS4 *s,
2017 Py_UCS4 c
2018 );
2019
2020PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
2021 const Py_UCS4 *s,
2022 Py_UCS4 c
2023 );
2024
/* Create a copy of a unicode string ending with a nul character. Return NULL
   and raise a MemoryError exception on memory allocation failure, otherwise
   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2028
Victor Stinner46408602010-09-03 16:18:00 +00002029PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002030 PyObject *unicode
2031 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002032#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002033
#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not.
   FIXME: use PyObject* type for op */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(void *op, int check_content);
#endif
2040
Guido van Rossumd8225182000-03-10 22:33:05 +00002041#ifdef __cplusplus
2042}
2043#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002044#endif /* !Py_UNICODEOBJECT_H */