blob: 1b4522df9a24de3742a367971487986a93ff8014 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
   unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Every argument is evaluated exactly once (target, then value, then
   length), so arguments with side effects are safe.  A length <= 0 writes
   nothing.  Expands to a single statement (do { ... } while (0)). */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with UTF-16 surrogates.  The argument is parenthesized in
   the expansion, so any expression may be passed (e.g. `c & 0xFFFF`), but
   it is evaluated twice: do not pass expressions with side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   The low 10 bits of `high` become bits 10..19 and the low 10 bits of `low`
   become bits 0..9 of the offset above 0x10000.  The macro does not check
   that its arguments really are surrogates. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200209 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
241 * kind = PyUnicode_WCHAR_KIND
242 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200243 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * ready = 0
245 * wstr is not NULL
246 * data.any is NULL
247 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200248 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200250
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200251 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200252
253 * structure = PyUnicodeObject structure
254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255 PyUnicode_4BYTE_KIND
256 * compact = 0
257 * ready = 1
258 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200259 * utf8 is shared and utf8_length = length with data.any if ascii = 1
260 * utf8_length = 0 if utf8 is NULL
261 * wstr is shared and wstr_length = length with data.any
262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
264 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200265
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200266 Compact strings use only one memory block (structure + characters),
267 whereas legacy strings use one block for the structure and one block
268 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200269
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200270 Legacy strings are created by PyUnicode_FromUnicode() and
271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272 when PyUnicode_READY() is called.
273
274 See also _PyUnicode_CheckConsistency().
275 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000276 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000278 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 struct {
280 /*
281 SSTATE_NOT_INTERNED (0)
282 SSTATE_INTERNED_MORTAL (1)
283 SSTATE_INTERNED_IMMORTAL (2)
284
285 If interned != SSTATE_NOT_INTERNED, the two references from the
286 dictionary to this object are *not* counted in ob_refcnt.
287 */
288 unsigned int interned:2;
289 /* Character size:
290
291 PyUnicode_WCHAR_KIND (0): wchar_t*
292 PyUnicode_1BYTE_KIND (1): Py_UCS1*
293 PyUnicode_2BYTE_KIND (2): Py_UCS2*
294 PyUnicode_4BYTE_KIND (3): Py_UCS4*
295 */
296 unsigned int kind:2;
297 /* Compact is with respect to the allocation scheme. Compact unicode
298 objects only require one memory block while non-compact objects use
299 one block for the PyUnicodeObject struct and another for its data
300 buffer. */
301 unsigned int compact:1;
Victor Stinnera3b334d2011-10-03 13:53:37 +0200302 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
303 characters. If ascii is 1 and compact is 1, use the PyASCIIObject
304 structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200305 unsigned int ascii:1;
306 /* The ready flag indicates whether the object layout is initialized
307 completely. This means that this is either a compact object, or
308 the data pointer is filled out. The bit is redundant, and helps
309 to minimize the test in PyUnicode_IS_READY(). */
310 unsigned int ready:1;
311 } state;
312 wchar_t *wstr; /* wchar_t representation (null-terminated) */
313} PyASCIIObject;
314
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure. */
318typedef struct {
319 PyASCIIObject _base;
320 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
321 * terminating \0. */
322 char *utf8; /* UTF-8 representation (null-terminated) */
323 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
324 * surrogates count as two code points. */
325} PyCompactUnicodeObject;
326
327/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
328 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200329 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330typedef struct {
331 PyCompactUnicodeObject _base;
332 union {
333 void *any;
334 Py_UCS1 *latin1;
335 Py_UCS2 *ucs2;
336 Py_UCS4 *ucs4;
337 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000338} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000339#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000340
Mark Hammond91a681d2002-08-12 07:21:58 +0000341PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000342PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000343
Thomas Wouters27d517b2007-02-25 20:39:11 +0000344#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000345 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
346#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000347
348/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000349#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350
/* Return the length of the wchar_t (wstr) representation.  Compact ASCII
   strings use the PyASCIIObject structure, which has no wstr_length field,
   so their length field is returned instead; every other string reports
   PyCompactUnicodeObject.wstr_length.  No type checks are performed, and
   `op` is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
355
356/* Returns the deprecated Py_UNICODE representation's size in code units
357 (this includes surrogate pairs as 2 units).
358 If the Py_UNICODE representation is not available, it will be computed
359 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
360
Guido van Rossumd8225182000-03-10 22:33:05 +0000361#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 (assert(PyUnicode_Check(op)), \
363 (((PyASCIIObject *)(op))->wstr) ? \
364 PyUnicode_WSTR_LENGTH(op) : \
365 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
366 PyUnicode_WSTR_LENGTH(op)))
367
Guido van Rossumd8225182000-03-10 22:33:05 +0000368#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
370
371/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
372 representation on demand. Using this macro is very inefficient now,
373 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
374 use PyUnicode_WRITE() and PyUnicode_READ(). */
375
Guido van Rossumd8225182000-03-10 22:33:05 +0000376#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377 (assert(PyUnicode_Check(op)), \
378 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
379 PyUnicode_AsUnicode((PyObject *)(op)))
380
Guido van Rossumd8225182000-03-10 22:33:05 +0000381#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382 ((const char *)(PyUnicode_AS_UNICODE(op)))
383
384
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
386
387/* Values for PyUnicodeObject.state: */
388
389/* Interning state. */
390#define SSTATE_NOT_INTERNED 0
391#define SSTATE_INTERNED_MORTAL 1
392#define SSTATE_INTERNED_IMMORTAL 2
393
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  `op` may be any pointer expression; it is
   cast to PyASCIIObject* as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op)                 \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is evaluated more than once. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200409
410/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200411 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200412 has not been called yet. */
413#define PyUnicode_WCHAR_KIND 0
414
415/* Return values of the PyUnicode_KIND() macro: */
416
417#define PyUnicode_1BYTE_KIND 1
418#define PyUnicode_2BYTE_KIND 2
419#define PyUnicode_4BYTE_KIND 3
420
421
422/* Return the number of bytes the string uses to represent single characters,
Victor Stinner4584a5b2011-10-01 02:39:37 +0200423 this can be 1, 2 or 4.
424
425 See also PyUnicode_KIND_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200426#define PyUnicode_CHARACTER_SIZE(op) \
427 (1 << (PyUnicode_KIND(op) - 1))
428
429/* Return pointers to the canonical representation casted as unsigned char,
430 Py_UCS2, or Py_UCS4 for direct character access.
431 No checks are performed, use PyUnicode_CHARACTER_SIZE or
432 PyUnicode_KIND() before to ensure these will work correctly. */
433
434#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
435#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
436#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
437
Victor Stinner157f83f2011-09-28 21:41:31 +0200438/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200439#define PyUnicode_KIND(op) \
440 (assert(PyUnicode_Check(op)), \
441 assert(PyUnicode_IS_READY(op)), \
442 ((PyASCIIObject *)(op))->state.kind)
443
Victor Stinner157f83f2011-09-28 21:41:31 +0200444/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445#define _PyUnicode_COMPACT_DATA(op) \
446 (PyUnicode_IS_COMPACT_ASCII(op) ? \
447 ((void*)((PyASCIIObject*)(op) + 1)) : \
448 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
449
450#define _PyUnicode_NONCOMPACT_DATA(op) \
451 (assert(((PyUnicodeObject*)(op))->data.any), \
452 ((((PyUnicodeObject *)(op))->data.any)))
453
454#define PyUnicode_DATA(op) \
455 (assert(PyUnicode_Check(op)), \
456 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
457 _PyUnicode_NONCOMPACT_DATA(op))
458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200459/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
Victor Stinner4584a5b2011-10-01 02:39:37 +0200460 The index is a character index, the result is a size in bytes.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461
Victor Stinner4584a5b2011-10-01 02:39:37 +0200462 See also PyUnicode_CHARACTER_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
464
465/* In the access macros below, "kind" may be evaluated more than once.
466 All other macro parameters are evaluated exactly once, so it is safe
467 to put side effects into them (such as increasing the index). */
468
/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location. */
474#define PyUnicode_WRITE(kind, data, index, value) \
475 do { \
476 switch ((kind)) { \
477 case PyUnicode_1BYTE_KIND: { \
478 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
479 break; \
480 } \
481 case PyUnicode_2BYTE_KIND: { \
482 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
483 break; \
484 } \
485 default: { \
486 assert((kind) == PyUnicode_4BYTE_KIND); \
487 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
488 } \
489 } \
490 } while (0)
491
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed. */
494#define PyUnicode_READ(kind, data, index) \
495 ((Py_UCS4) \
496 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200497 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200498 ((kind) == PyUnicode_2BYTE_KIND ? \
499 ((const Py_UCS2 *)(data))[(index)] : \
500 ((const Py_UCS4 *)(data))[(index)] \
501 ) \
502 ))
503
504/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
505 calls PyUnicode_KIND() and might call it twice. For single reads, use
506 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
507 cache kind and use PyUnicode_READ instead. */
508#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200509 (assert(PyUnicode_Check(unicode)), \
510 assert(PyUnicode_IS_READY(unicode)), \
511 (Py_UCS4) \
512 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
513 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
514 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
515 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
516 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
517 ) \
518 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200519
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_READY() to ensure that. */
523#define PyUnicode_GET_LENGTH(op) \
524 (assert(PyUnicode_Check(op)), \
525 assert(PyUnicode_IS_READY(op)), \
526 ((PyASCIIObject *)(op))->length)
527
528
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Reads the state.ready bit only; no type checks are performed.  `op` may
   be any pointer expression; it is cast to PyASCIIObject* as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
533
Victor Stinnera3b334d2011-10-03 13:53:37 +0200534/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200536 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537 Returns 0 on success and -1 on errors. */
538#define PyUnicode_READY(op) \
539 (assert(PyUnicode_Check(op)), \
540 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200541 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string. */
546#define PyUnicode_MAX_CHAR_VALUE(op) \
547 (assert(PyUnicode_IS_READY(op)), \
548 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
549 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
550 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
551 (0x7fU) : (0xffU) \
552 ) : \
553 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
554 (0xffffU) : (0x10ffffU) \
555 ))))
556
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000557#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000558
559/* --- Constants ---------------------------------------------------------- */
560
561/* This Unicode character will be used as replacement character during
562 decoding if the errors argument is set to "replace". Note: the
563 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
564 Unicode 3.0. */
565
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200566#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000567
568/* === Public API ========================================================= */
569
570/* --- Plain Py_UNICODE --------------------------------------------------- */
571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200572/* With PEP 393, this is the recommended way to allocate a new unicode object.
573 This function will allocate the object and its buffer in a single memory
574 block. Objects created using this function are not resizable. */
575#ifndef Py_LIMITED_API
576PyAPI_FUNC(PyObject*) PyUnicode_New(
577 Py_ssize_t size, /* Number of code points in the new string */
578 Py_UCS4 maxchar /* maximum code point value in the string */
579 );
580#endif
581
/* Initializes the canonical string representation from the deprecated
583 wstr/Py_UNICODE representation. This function is used to convert Unicode
584 objects which were created using the old API to the new flexible format
585 introduced with PEP 393.
586
587 Don't call this function directly, use the public PyUnicode_READY() macro
588 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200589#ifndef Py_LIMITED_API
590PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200591 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592 );
593#endif
594
Victor Stinner034f6cf2011-09-30 02:26:44 +0200595/* Get a copy of a Unicode string. */
596PyAPI_FUNC(PyObject*) PyUnicode_Copy(
597 PyObject *unicode
598 );
599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200600/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200601 character conversion when necessary and falls back to memcpy if possible.
602
Victor Stinnera0702ab2011-09-29 14:14:38 +0200603 Fail if to is too small (smaller than how_many or smaller than
604 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
605 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200606
607 Return the number of written character, or return -1 and raise an exception
608 on error.
609
610 Pseudo-code:
611
612 how_many = min(how_many, len(from) - from_start)
613 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
614 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200615
616 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200617 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200618#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200619PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620 PyObject *to,
621 Py_ssize_t to_start,
622 PyObject *from,
623 Py_ssize_t from_start,
624 Py_ssize_t how_many
625 );
626#endif
627
Guido van Rossumd8225182000-03-10 22:33:05 +0000628/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000629 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000630
631 u may be NULL which causes the contents to be undefined. It is the
632 user's responsibility to fill in the needed data afterwards. Note
633 that modifying the Unicode object contents after construction is
634 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000635
636 The buffer is copied into the new object. */
637
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000638#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000639PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000640 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000641 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000642 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000643#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000644
Georg Brandl952867a2010-06-27 10:17:12 +0000645/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000647 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000648 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000649 );
650
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000651/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200652 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000653PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000654 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000655 );
656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657#ifndef Py_LIMITED_API
658PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
659 int kind,
660 const void *buffer,
661 Py_ssize_t size);
662#endif
663
664PyAPI_FUNC(PyObject*) PyUnicode_Substring(
665 PyObject *str,
666 Py_ssize_t start,
667 Py_ssize_t end);
668
669/* Copy the string into a UCS4 buffer including the null character if copy_null
670 is set. Return NULL and raise an exception on error. Raise a ValueError if
671 the buffer is smaller than the string. Return buffer on success.
672
673 buflen is the length of the buffer in (Py_UCS4) characters. */
674PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
675 PyObject *unicode,
676 Py_UCS4* buffer,
677 Py_ssize_t buflen,
678 int copy_null);
679
680/* Copy the string into a UCS4 buffer. A new buffer is allocated using
681 * PyMem_Malloc; if this fails, NULL is returned with a memory error
682 exception set. */
683PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
684
Guido van Rossumd8225182000-03-10 22:33:05 +0000685/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200686 Py_UNICODE buffer.
687 If the wchar_t/Py_UNICODE representation is not yet available, this
688 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000689
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000690#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000691PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000692 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000693 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000694#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200696/* Return a read-only pointer to the Unicode object's internal
697 Py_UNICODE buffer and save the length at size.
698 If the wchar_t/Py_UNICODE representation is not yet available, this
699 function will calculate it. */
700
701#ifndef Py_LIMITED_API
702PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
703 PyObject *unicode, /* Unicode object */
704 Py_ssize_t *size /* location where to save the length */
705 );
706#endif
707
Guido van Rossumd8225182000-03-10 22:33:05 +0000708/* Get the length of the Unicode object. */
709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
711 PyObject *unicode
712);
713
Victor Stinner157f83f2011-09-28 21:41:31 +0200714/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 string representation. */
716
Martin v. Löwis18e16552006-02-15 17:27:45 +0000717PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000718 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000719 );
720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721/* Read a character from the string. */
722
723PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
724 PyObject *unicode,
725 Py_ssize_t index
726 );
727
728/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200729 PyUnicode_New, must not be shared, and must not have been hashed yet.
730
731 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732
733PyAPI_FUNC(int) PyUnicode_WriteChar(
734 PyObject *unicode,
735 Py_ssize_t index,
736 Py_UCS4 character
737 );
738
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000739#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000740/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000741PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000742#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000743
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200744/* Resize a Unicode object allocated by the legacy API (e.g.
745 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
746 PyUnicode_New) cannot be resized by this function.
747
748 The length is a number of Py_UNICODE characters (and not the number of code
749 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000750
751 *unicode is modified to point to the new (resized) object and 0
752 returned on success.
753
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200754 If the refcount on the object is 1, the function resizes the string in
755 place, which is usually faster than allocating a new string (and copying
756 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000757
758 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200759 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000760
Mark Hammond91a681d2002-08-12 07:21:58 +0000761PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 PyObject **unicode, /* Pointer to the Unicode object */
763 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000764 );
765
Guido van Rossumd8225182000-03-10 22:33:05 +0000766/* Coerce obj to a Unicode object and return a reference with
767 *incremented* refcount.
768
769 Coercion is done in the following way:
770
Georg Brandl952867a2010-06-27 10:17:12 +0000771 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000772 under the assumption that they contain data using the UTF-8
773 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000774
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000775 2. All other objects (including Unicode objects) raise an
776 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000777
778 The API returns NULL in case of an error. The caller is responsible
779 for decref'ing the returned objects.
780
781*/
782
Mark Hammond91a681d2002-08-12 07:21:58 +0000783PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000785 const char *encoding, /* encoding */
786 const char *errors /* error handling */
787 );
788
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000789/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000790 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000792 Unicode objects are passed back as-is (subclasses are converted to
793 true Unicode objects), all other objects are delegated to
794 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000795 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000796
797 The API returns NULL in case of an error. The caller is responsible
798 for decref'ing the returned objects.
799
800*/
801
Mark Hammond91a681d2002-08-12 07:21:58 +0000802PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000804 );
805
Victor Stinner1205f272010-09-11 00:54:47 +0000806PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
807 const char *format, /* ASCII-encoded string */
808 va_list vargs
809 );
810PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
811 const char *format, /* ASCII-encoded string */
812 ...
813 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000814
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000815#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000816/* Format the object based on the format_spec, as defined in PEP 3101
817 (Advanced String Formatting). */
818PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819 PyObject *format_spec,
820 Py_ssize_t start,
821 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000822#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000823
Walter Dörwald16807132007-05-25 13:52:07 +0000824PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
825PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000826PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
827 const char *u /* UTF-8 encoded string */
828 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000829#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000830PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000831#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000832
833/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834#define PyUnicode_CHECK_INTERNED(op) \
835 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000836
Guido van Rossumd8225182000-03-10 22:33:05 +0000837/* --- wchar_t support for platforms which support it --------------------- */
838
839#ifdef HAVE_WCHAR_H
840
Georg Brandl952867a2010-06-27 10:17:12 +0000841/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000842 size.
843
844 The buffer is copied into the new object. */
845
Mark Hammond91a681d2002-08-12 07:21:58 +0000846PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000847 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000848 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000849 );
850
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000851/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000852 most size wchar_t characters are copied.
853
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000854 Note that the resulting wchar_t string may or may not be
855 0-terminated. It is the responsibility of the caller to make sure
856 that the wchar_t string is 0-terminated in case this is required by
857 the application.
858
859 Returns the number of wchar_t characters copied (excluding a
860 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000861 error. */
862
Martin v. Löwis18e16552006-02-15 17:27:45 +0000863PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000864 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000865 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 );
868
Victor Stinner137c34c2010-09-29 10:25:54 +0000869/* Convert the Unicode object to a wide character string. The output string
870 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200871 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000872
873 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
874 on success. On error, returns NULL, *size is undefined and raises a
875 MemoryError. */
876
877PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000878 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000879 Py_ssize_t *size /* number of characters of the result */
880 );
881
Victor Stinner9f789e72011-10-01 03:57:28 +0200882#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200884#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885
Guido van Rossumd8225182000-03-10 22:33:05 +0000886#endif
887
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000888/* --- Unicode ordinals --------------------------------------------------- */
889
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890/* Create a Unicode Object from the given Unicode code point ordinal.
891
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000892 The ordinal must be in range(0x10000) on narrow Python builds
893 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
894 raised in case it is not.
895
896*/
897
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000898PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000899
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000900/* --- Free-list management ----------------------------------------------- */
901
902/* Clear the free list used by the Unicode implementation.
903
904 This can be used to release memory used for objects on the free
905 list back to the Python memory allocator.
906
907*/
908
909PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
910
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000912
913 Many of these APIs take two arguments encoding and errors. These
914 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000915 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000916
Georg Brandl952867a2010-06-27 10:17:12 +0000917 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000918
919 Error handling is set by errors which may also be set to NULL
920 meaning to use the default handling defined for the codec. Default
921 error handling for all builtin codecs is "strict" (ValueErrors are
922 raised).
923
924 The codecs all use a similar interface. Only deviations from the
925 generic ones are documented.
926
927*/
928
Fred Drakecb093fe2000-05-09 19:51:53 +0000929/* --- Manage the default encoding ---------------------------------------- */
930
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000931/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000932 Unicode object unicode and the size of the encoded representation
933 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000934
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000935 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000936
Victor Stinner157f83f2011-09-28 21:41:31 +0200937 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 and subsequent calls will return the same string. The memory is released
939 when the unicodeobject is deallocated.
940
941 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
942 support the previous internal function with the same behaviour.
943
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000944 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000945 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000946
947 *** If you need to access the Unicode object as UTF-8 bytes string,
948 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000949*/
950
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000951#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000953 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000954 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000956#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000957
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000958/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000959 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
962 in the unicodeobject.
963
964 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
965 support the previous internal function with the same behaviour.
966
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000967 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000968 extracted from the returned data.
969
970 *** This API is for interpreter INTERNAL USE ONLY and will likely
971 *** be removed or changed in the future.
972
973 *** If you need to access the Unicode object as UTF-8 bytes string,
974 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000975
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000976*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000977
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000978#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
980#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000981#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000982
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000983/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000984
Mark Hammond91a681d2002-08-12 07:21:58 +0000985PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000986
Guido van Rossumd8225182000-03-10 22:33:05 +0000987/* --- Generic Codecs ----------------------------------------------------- */
988
989/* Create a Unicode object by decoding the encoded string s of the
990 given size. */
991
Mark Hammond91a681d2002-08-12 07:21:58 +0000992PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000993 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000994 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000995 const char *encoding, /* encoding */
996 const char *errors /* error handling */
997 );
998
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000999/* Decode a Unicode object unicode and return the result as Python
1000 object. */
1001
1002PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001003 PyObject *unicode, /* Unicode object */
1004 const char *encoding, /* encoding */
1005 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001006 );
1007
1008/* Decode a Unicode object unicode and return the result as Unicode
1009 object. */
1010
1011PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001012 PyObject *unicode, /* Unicode object */
1013 const char *encoding, /* encoding */
1014 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001015 );
1016
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001018 Python string object. */
1019
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001020#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001021PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001022 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001023 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 const char *encoding, /* encoding */
1025 const char *errors /* error handling */
1026 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001027#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001028
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001029/* Encodes a Unicode object and returns the result as Python
1030 object. */
1031
1032PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001033 PyObject *unicode, /* Unicode object */
1034 const char *encoding, /* encoding */
1035 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001036 );
1037
Guido van Rossumd8225182000-03-10 22:33:05 +00001038/* Encodes a Unicode object and returns the result as Python string
1039 object. */
1040
Mark Hammond91a681d2002-08-12 07:21:58 +00001041PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 PyObject *unicode, /* Unicode object */
1043 const char *encoding, /* encoding */
1044 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001045 );
1046
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001047/* Encodes a Unicode object and returns the result as Unicode
1048 object. */
1049
1050PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 PyObject *unicode, /* Unicode object */
1052 const char *encoding, /* encoding */
1053 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001054 );
1055
1056/* Build an encoding map. */
1057
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001058PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1059 PyObject* string /* 256 character map */
1060 );
1061
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062/* --- UTF-7 Codecs ------------------------------------------------------- */
1063
Mark Hammond91a681d2002-08-12 07:21:58 +00001064PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 const char *string, /* UTF-7 encoded string */
1066 Py_ssize_t length, /* size of string */
1067 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 );
1069
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 const char *string, /* UTF-7 encoded string */
1072 Py_ssize_t length, /* size of string */
1073 const char *errors, /* error handling */
1074 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001075 );
1076
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001077#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001078PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001079 const Py_UNICODE *data, /* Unicode char buffer */
1080 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1081 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1082 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1083 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001085#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086
Guido van Rossumd8225182000-03-10 22:33:05 +00001087/* --- UTF-8 Codecs ------------------------------------------------------- */
1088
Mark Hammond91a681d2002-08-12 07:21:58 +00001089PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 const char *string, /* UTF-8 encoded string */
1091 Py_ssize_t length, /* size of string */
1092 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001093 );
1094
Walter Dörwald69652032004-09-07 20:24:22 +00001095PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001096 const char *string, /* UTF-8 encoded string */
1097 Py_ssize_t length, /* size of string */
1098 const char *errors, /* error handling */
1099 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001100 );
1101
Mark Hammond91a681d2002-08-12 07:21:58 +00001102PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001103 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001104 );
1105
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001106#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1108 PyObject *unicode,
1109 const char *errors);
1110
Mark Hammond91a681d2002-08-12 07:21:58 +00001111PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001112 const Py_UNICODE *data, /* Unicode char buffer */
1113 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1114 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001115 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001116#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001117
Walter Dörwald41980ca2007-08-16 21:55:45 +00001118/* --- UTF-32 Codecs ------------------------------------------------------ */
1119
1120/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1121 the corresponding Unicode object.
1122
1123 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001124 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001125
1126 If byteorder is non-NULL, the decoder starts decoding using the
1127 given byte order:
1128
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001129 *byteorder == -1: little endian
1130 *byteorder == 0: native order
1131 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001132
1133 In native mode, the first four bytes of the stream are checked for a
1134 BOM mark. If found, the BOM mark is analysed, the byte order
1135 adjusted and the BOM skipped. In the other modes, no BOM mark
1136 interpretation is done. After completion, *byteorder is set to the
1137 current byte order at the end of input data.
1138
1139 If byteorder is NULL, the codec starts in native order mode.
1140
1141*/
1142
1143PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 const char *string, /* UTF-32 encoded string */
1145 Py_ssize_t length, /* size of string */
1146 const char *errors, /* error handling */
1147 int *byteorder /* pointer to byteorder to use
1148 0=native;-1=LE,1=BE; updated on
1149 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001150 );
1151
1152PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 const char *string, /* UTF-32 encoded string */
1154 Py_ssize_t length, /* size of string */
1155 const char *errors, /* error handling */
1156 int *byteorder, /* pointer to byteorder to use
1157 0=native;-1=LE,1=BE; updated on
1158 exit */
1159 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001160 );
1161
1162/* Returns a Python string using the UTF-32 encoding in native byte
1163 order. The string always starts with a BOM mark. */
1164
1165PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001167 );
1168
1169/* Returns a Python string object holding the UTF-32 encoded value of
1170 the Unicode data.
1171
1172 The byteorder argument selects the byte order in which the output is
1173 written:
1174
1175 byteorder == -1: little endian
1176 byteorder == 0: native byte order (writes a BOM mark)
1177 byteorder == 1: big endian
1178
1179 If byteorder is 0, the output string will always start with the
1180 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1181 prepended.
1182
1183*/
1184
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001185#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001186PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001187 const Py_UNICODE *data, /* Unicode char buffer */
1188 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1189 const char *errors, /* error handling */
1190 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001191 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001192#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001193
Guido van Rossumd8225182000-03-10 22:33:05 +00001194/* --- UTF-16 Codecs ------------------------------------------------------ */
1195
Guido van Rossum9e896b32000-04-05 20:11:21 +00001196/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001197 the corresponding Unicode object.
1198
1199 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001201
1202 If byteorder is non-NULL, the decoder starts decoding using the
1203 given byte order:
1204
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 *byteorder == -1: little endian
1206 *byteorder == 0: native order
1207 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001208
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001209 In native mode, the first two bytes of the stream are checked for a
1210 BOM mark. If found, the BOM mark is analysed, the byte order
1211 adjusted and the BOM skipped. In the other modes, no BOM mark
1212 interpretation is done. After completion, *byteorder is set to the
1213 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001214
1215 If byteorder is NULL, the codec starts in native order mode.
1216
1217*/
1218
Mark Hammond91a681d2002-08-12 07:21:58 +00001219PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 const char *string, /* UTF-16 encoded string */
1221 Py_ssize_t length, /* size of string */
1222 const char *errors, /* error handling */
1223 int *byteorder /* pointer to byteorder to use
1224 0=native;-1=LE,1=BE; updated on
1225 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001226 );
1227
Walter Dörwald69652032004-09-07 20:24:22 +00001228PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 const char *string, /* UTF-16 encoded string */
1230 Py_ssize_t length, /* size of string */
1231 const char *errors, /* error handling */
1232 int *byteorder, /* pointer to byteorder to use
1233 0=native;-1=LE,1=BE; updated on
1234 exit */
1235 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001236 );
1237
Guido van Rossumd8225182000-03-10 22:33:05 +00001238/* Returns a Python string using the UTF-16 encoding in native byte
1239 order. The string always starts with a BOM mark. */
1240
Mark Hammond91a681d2002-08-12 07:21:58 +00001241PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001243 );
1244
1245/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001246 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001247
1248 If byteorder is not 0, output is written according to the following
1249 byte order:
1250
1251 byteorder == -1: little endian
1252 byteorder == 0: native byte order (writes a BOM mark)
1253 byteorder == 1: big endian
1254
1255 If byteorder is 0, the output string will always start with the
1256 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1257 prepended.
1258
1259 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1260 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001261 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001262
1263*/
1264
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001265#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001266PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 const Py_UNICODE *data, /* Unicode char buffer */
1268 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1269 const char *errors, /* error handling */
1270 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001271 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001272#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001273
1274/* --- Unicode-Escape Codecs ---------------------------------------------- */
1275
Mark Hammond91a681d2002-08-12 07:21:58 +00001276PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 const char *string, /* Unicode-Escape encoded string */
1278 Py_ssize_t length, /* size of string */
1279 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001280 );
1281
Mark Hammond91a681d2002-08-12 07:21:58 +00001282PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001284 );
1285
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001286#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001287PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 const Py_UNICODE *data, /* Unicode char buffer */
1289 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001290 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001291#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001292
1293/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1294
Mark Hammond91a681d2002-08-12 07:21:58 +00001295PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 const char *string, /* Raw-Unicode-Escape encoded string */
1297 Py_ssize_t length, /* size of string */
1298 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001299 );
1300
Mark Hammond91a681d2002-08-12 07:21:58 +00001301PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001303 );
1304
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001306PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001307 const Py_UNICODE *data, /* Unicode char buffer */
1308 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001310#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001311
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001312/* --- Unicode Internal Codec ---------------------------------------------
1313
1314 Only for internal use in _codecsmodule.c */
1315
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001316#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001317PyObject *_PyUnicode_DecodeUnicodeInternal(
1318 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001319 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001320 const char *errors
1321 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001323
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001324/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001325
1326 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1327
1328*/
1329
Mark Hammond91a681d2002-08-12 07:21:58 +00001330PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001331 const char *string, /* Latin-1 encoded string */
1332 Py_ssize_t length, /* size of string */
1333 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001334 );
1335
Mark Hammond91a681d2002-08-12 07:21:58 +00001336PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001338 );
1339
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001340#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1342 PyObject* unicode,
1343 const char* errors);
1344
Mark Hammond91a681d2002-08-12 07:21:58 +00001345PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 const Py_UNICODE *data, /* Unicode char buffer */
1347 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1348 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001349 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001350#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001351
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001352/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001353
1354 Only 7-bit ASCII data is accepted. All other codes generate errors.
1355
1356*/
1357
Mark Hammond91a681d2002-08-12 07:21:58 +00001358PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 const char *string, /* ASCII encoded string */
1360 Py_ssize_t length, /* size of string */
1361 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001362 );
1363
Mark Hammond91a681d2002-08-12 07:21:58 +00001364PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001366 );
1367
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001368#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1370 PyObject* unicode,
1371 const char* errors);
1372
Mark Hammond91a681d2002-08-12 07:21:58 +00001373PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 const Py_UNICODE *data, /* Unicode char buffer */
1375 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1376 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001377 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001378#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001379
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001383
1384 Decoding mappings must map single string characters to single
1385 Unicode characters, integers (which are then interpreted as Unicode
1386 ordinals) or None (meaning "undefined mapping" and causing an
1387 error).
1388
1389 Encoding mappings must map single Unicode characters to single
1390 string characters, integers (which are then interpreted as Latin-1
1391 ordinals) or None (meaning "undefined mapping" and causing an
1392 error).
1393
1394 If a character lookup fails with a LookupError, the character is
1395 copied as-is meaning that its ordinal value will be interpreted as
1396 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1397 to contain those mappings which map characters to different code
1398 points.
1399
1400*/
1401
Mark Hammond91a681d2002-08-12 07:21:58 +00001402PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 const char *string, /* Encoded string */
1404 Py_ssize_t length, /* size of string */
1405 PyObject *mapping, /* character mapping
1406 (char ordinal -> unicode ordinal) */
1407 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001408 );
1409
Mark Hammond91a681d2002-08-12 07:21:58 +00001410PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 PyObject *unicode, /* Unicode object */
1412 PyObject *mapping /* character mapping
1413 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001414 );
1415
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001416#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001417PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 const Py_UNICODE *data, /* Unicode char buffer */
1419 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1420 PyObject *mapping, /* character mapping
1421 (unicode ordinal -> char ordinal) */
1422 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001423 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001424#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001425
1426/* Translate a Py_UNICODE buffer of the given length by applying a
1427 character mapping table to it and return the resulting Unicode
1428 object.
1429
1430 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001432
1433 Mapping tables may be dictionaries or sequences. Unmapped character
1434 ordinals (ones which cause a LookupError) are left untouched and
1435 are copied as-is.
1436
1437*/
1438
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001439#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001440PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 const Py_UNICODE *data, /* Unicode char buffer */
1442 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1443 PyObject *table, /* Translate table */
1444 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001445 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001446#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001447
Victor Stinner99b95382011-07-04 14:23:54 +02001448#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001449
Guido van Rossumefec1152000-03-28 02:01:15 +00001450/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001451
Mark Hammond91a681d2002-08-12 07:21:58 +00001452PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001453 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001454 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001455 const char *errors /* error handling */
1456 );
1457
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001458PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1459 const char *string, /* MBCS encoded string */
1460 Py_ssize_t length, /* size of string */
1461 const char *errors, /* error handling */
1462 Py_ssize_t *consumed /* bytes consumed */
1463 );
1464
Mark Hammond91a681d2002-08-12 07:21:58 +00001465PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001466 PyObject *unicode /* Unicode object */
1467 );
1468
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001469#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001470PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001471 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001472 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001473 const char *errors /* error handling */
1474 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001475#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001476
Victor Stinner99b95382011-07-04 14:23:54 +02001477#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001478
Guido van Rossum9e896b32000-04-05 20:11:21 +00001479/* --- Decimal Encoder ---------------------------------------------------- */
1480
1481/* Takes a Unicode string holding a decimal value and writes it into
1482 an output buffer using standard ASCII digit codes.
1483
1484 The output buffer has to provide at least length+1 bytes of storage
1485 area. The output string is 0-terminated.
1486
1487 The encoder converts whitespace to ' ', decimal characters to their
1488 corresponding ASCII digit and all other Latin-1 characters except
1489 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1490 are treated as errors. This includes embedded NUL characters.
1491
1492 Error handling is defined by the errors argument:
1493
1494 NULL or "strict": raise a ValueError
1495 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001497 "replace": replaces illegal characters with '?'
1498
1499 Returns 0 on success, -1 on failure.
1500
1501*/
1502
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001503#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001504PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001505 Py_UNICODE *s, /* Unicode buffer */
1506 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1507 char *output, /* Output buffer; must have size >= length + 1
                     (the output string is 0-terminated) */
1508 const char *errors /* error handling: NULL or "strict", "ignore",
                           or "replace" */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001509 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001510#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001511
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001512/* Transforms code points that have decimal digit property to the
1513 corresponding ASCII digit code points.
1514
1515 Returns a new Unicode string on success, NULL on failure.
1516*/
1517
Georg Brandlb5503082010-12-05 11:40:48 +00001518#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001519PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1520 Py_UNICODE *s, /* Unicode buffer */
1521 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1522 );
Georg Brandlb5503082010-12-05 11:40:48 +00001523#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1526 as argument instead of a raw buffer and length. This function additionally
1527 transforms spaces to ASCII because this is what the callers in longobject,
1528 floatobject, and complexobject did anyways. */
1529
1530#ifndef Py_LIMITED_API
1531PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1532 PyObject *unicode /* Unicode object */
1533 );
1534#endif
1535
Martin v. Löwis011e8422009-05-05 04:43:17 +00001536/* --- File system encoding ---------------------------------------------- */
1537
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001538/* ParseTuple converter: encode str objects to bytes using
1539 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001540
1541PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1542
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001543/* ParseTuple converter: decode bytes objects to unicode using
1544 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1545
1546PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1547
Victor Stinner77c38622010-05-14 15:58:55 +00001548/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1549 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001550
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001551 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1552 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001553
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001554 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001555*/
1556
1557PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1558 const char *s /* encoded string */
1559 );
1560
Victor Stinner77c38622010-05-14 15:58:55 +00001561/* Decode a string using Py_FileSystemDefaultEncoding
1562 and the "surrogateescape" error handler.
1563
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001564 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1565 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001566*/
1567
Martin v. Löwis011e8422009-05-05 04:43:17 +00001568PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1569 const char *s, /* encoded string */
1570 Py_ssize_t size /* size */
1571 );
1572
Victor Stinnerae6265f2010-05-15 16:27:27 +00001573/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001574 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001575
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001576 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1577 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001578*/
1579
1580PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1581 PyObject *unicode
1582 );
1583
Guido van Rossumd8225182000-03-10 22:33:05 +00001584/* --- Methods & Slots ----------------------------------------------------
1585
1586 These are capable of handling Unicode objects and strings on input
1587 (we refer to them as strings in the descriptions) and return
1588 Unicode objects or integers as appropriate. */
1589
1590/* Concat two strings giving a new Unicode string. */
1591
Mark Hammond91a681d2002-08-12 07:21:58 +00001592PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 PyObject *left, /* Left string */
1594 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001595 );
1596
Walter Dörwald1ab83302007-05-18 17:15:44 +00001597/* Concat two strings and put the result in *pleft
1598 (sets *pleft to NULL on error) */
1599
1600PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 PyObject **pleft, /* Pointer to left string */
1602 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001603 );
1604
1605/* Concat two strings, put the result in *pleft and drop the right object
1606 (sets *pleft to NULL on error) */
1607
1608PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 PyObject **pleft, /* Pointer to left string */
1610 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001611 );
1612
Guido van Rossumd8225182000-03-10 22:33:05 +00001613/* Split a string giving a list of Unicode strings.
1614
1615 If sep is NULL, splitting will be done at all whitespace
1616 substrings. Otherwise, splits occur at the given separator.
1617
1618 At most maxsplit splits will be done. If negative, no limit is set.
1619
1620 Separators are not included in the resulting list.
1621
1622*/
1623
Mark Hammond91a681d2002-08-12 07:21:58 +00001624PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 PyObject *s, /* String to split */
1626 PyObject *sep, /* String separator */
1627 Py_ssize_t maxsplit /* Maxsplit count */
1628 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001629
1630/* Ditto, but split at line breaks.
1631
1632 CRLF is considered to be one line break. Line breaks are not
1633 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634
Mark Hammond91a681d2002-08-12 07:21:58 +00001635PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 PyObject *s, /* String to split */
1637 int keepends /* If true, line end markers are included */
1638 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001639
Thomas Wouters477c8d52006-05-27 19:21:47 +00001640/* Partition a string using a given separator. */
1641
1642PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 PyObject *s, /* String to partition */
1644 PyObject *sep /* String separator */
1645 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001646
1647/* Partition a string using a given separator, searching from the end of the
1648 string. */
1649
1650PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 PyObject *s, /* String to partition */
1652 PyObject *sep /* String separator */
1653 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001654
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001655/* Split a string giving a list of Unicode strings.
1656
1657 If sep is NULL, splitting will be done at all whitespace
1658 substrings. Otherwise, splits occur at the given separator.
1659
1660 At most maxsplit splits will be done; if maxsplit is negative, no
1661 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1662 the end of the string.
1663
1664 Separators are not included in the resulting list.
1665
1666*/
1667
1668PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669 PyObject *s, /* String to split */
1670 PyObject *sep, /* String separator */
1671 Py_ssize_t maxsplit /* Maxsplit count */
1672 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001673
Guido van Rossumd8225182000-03-10 22:33:05 +00001674/* Translate a string by applying a character mapping table to it and
1675 return the resulting Unicode object.
1676
1677 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001679
1680 Mapping tables may be dictionaries or sequences. Unmapped character
1681 ordinals (ones which cause a LookupError) are left untouched and
1682 are copied as-is.
1683
1684*/
1685
Mark Hammond91a681d2002-08-12 07:21:58 +00001686PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 PyObject *str, /* String */
1688 PyObject *table, /* Translate table */
1689 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001690 );
1691
1692/* Join a sequence of strings using the given separator and return
1693 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001694
Mark Hammond91a681d2002-08-12 07:21:58 +00001695PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 PyObject *separator, /* Separator string */
1697 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001698 );
1699
1700/* Return 1 if substr matches str[start:end] at the given tail end, 0
1701 otherwise. */
1702
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 PyObject *str, /* String */
1705 PyObject *substr, /* Prefix or Suffix string */
1706 Py_ssize_t start, /* Start index */
1707 Py_ssize_t end, /* Stop index */
1708 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001709 );
1710
1711/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001712 given search direction or -1 if not found. -2 is returned in case
1713 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001714
Martin v. Löwis18e16552006-02-15 17:27:45 +00001715PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001716 PyObject *str, /* String */
1717 PyObject *substr, /* Substring to find */
1718 Py_ssize_t start, /* Start index */
1719 Py_ssize_t end, /* Stop index */
1720 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001721 );
1722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723/* Like PyUnicode_Find, but search for single character only. */
1724PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1725 PyObject *str, /* String to search */
1726 Py_UCS4 ch, /* Single character to find */
1727 Py_ssize_t start, /* Start index */
1728 Py_ssize_t end, /* Stop index */
1729 int direction /* Find direction; presumably +1 forward, -1 backward
                    as for PyUnicode_Find -- TODO confirm */
1730 );
1731
Barry Warsaw51ac5802000-03-20 16:36:48 +00001732/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001733
Martin v. Löwis18e16552006-02-15 17:27:45 +00001734PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001735 PyObject *str, /* String */
1736 PyObject *substr, /* Substring to count */
1737 Py_ssize_t start, /* Start index */
1738 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001739 );
1740
Barry Warsaw51ac5802000-03-20 16:36:48 +00001741/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001742 and return the resulting Unicode object. */
1743
Mark Hammond91a681d2002-08-12 07:21:58 +00001744PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 PyObject *str, /* String */
1746 PyObject *substr, /* Substring to find */
1747 PyObject *replstr, /* Substring to replace */
1748 Py_ssize_t maxcount /* Max. number of replacements to apply;
1749 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001750 );
1751
1752/* Compare two strings and return -1, 0, 1 for less than, equal,
1753 greater than resp. */
1754
Mark Hammond91a681d2002-08-12 07:21:58 +00001755PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001756 PyObject *left, /* Left string */
1757 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001758 );
1759
Martin v. Löwis5b222132007-06-10 09:51:05 +00001760PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1761 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001762 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001763 );
1764
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001765/* Rich compare two strings and return one of the following:
1766
1767 - NULL in case an exception was raised
1768 - Py_True or Py_False for successful comparisons
1769 - Py_NotImplemented in case the type combination is unknown
1770
1771 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1772 case the conversion of the arguments to Unicode fails with a
1773 UnicodeDecodeError.
1774
1775 Possible values for op:
1776
1777 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1778
1779*/
1780
1781PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001782 PyObject *left, /* Left string */
1783 PyObject *right, /* Right string */
1784 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001785 );
1786
Thomas Wouters7e474022000-07-16 12:04:32 +00001787/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001788 the resulting Unicode string. */
1789
Mark Hammond91a681d2002-08-12 07:21:58 +00001790PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001791 PyObject *format, /* Format string */
1792 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001793 );
1794
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001795/* Checks whether element is contained in container and returns 1/0
1796 accordingly.
1797
1798 element has to coerce to a one-element Unicode string. -1 is
1799 returned in case of an error. */
1800
Mark Hammond91a681d2002-08-12 07:21:58 +00001801PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 PyObject *container, /* Container string */
1803 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001804 );
1805
Martin v. Löwis47383402007-08-15 07:32:56 +00001806/* Checks whether argument is a valid identifier. */
1807
1808PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1809
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001810#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001811/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001812PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001813 PyUnicodeObject *self,
1814 int striptype,
1815 PyObject *sepobj
1816 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001817#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001818
Eric Smith5807c412008-05-11 21:00:57 +00001819/* Using the current locale, insert the thousands grouping
1820 into the string pointed to by buffer. For the argument descriptions,
1821 see Objects/stringlib/localeutil.h */
1822
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001823#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001824PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1825 Py_ssize_t n_buffer,
1826 Py_UNICODE *digits,
1827 Py_ssize_t n_digits,
1828 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001829#endif
Eric Smith5807c412008-05-11 21:00:57 +00001830
Eric Smitha3b1ac82009-04-03 14:45:06 +00001831/* Using explicit passed-in values, insert the thousands grouping
1832 into the string pointed to by buffer. For the argument descriptions,
1833 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001834#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1836 int kind,
1837 void *buffer,
1838 Py_ssize_t n_buffer,
1839 void *digits,
1840 Py_ssize_t n_digits,
1841 Py_ssize_t min_width,
1842 const char *grouping,
1843 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001844#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001845/* === Characters Type APIs =============================================== */
1846
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001847/* Helper array used by Py_UNICODE_ISSPACE(). */
1848
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001849#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001850PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1851
/* These should not be used directly. Use the Py_UNICODE_IS* and
   Py_UNICODE_TO* macros instead.

   These APIs are implemented in Objects/unicodectype.c.

*/
1858
Mark Hammond91a681d2002-08-12 07:21:58 +00001859PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001860 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001861 );
1862
Mark Hammond91a681d2002-08-12 07:21:58 +00001863PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001864 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001865 );
1866
Mark Hammond91a681d2002-08-12 07:21:58 +00001867PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001868 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001869 );
1870
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001871PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001872 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001873 );
1874
1875PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001876 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001877 );
1878
Mark Hammond91a681d2002-08-12 07:21:58 +00001879PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001880 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001881 );
1882
Mark Hammond91a681d2002-08-12 07:21:58 +00001883PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001884 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001885 );
1886
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001887PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1888 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001889 );
1890
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001891PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1892 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001893 );
1894
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001895PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1896 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001897 );
1898
Mark Hammond91a681d2002-08-12 07:21:58 +00001899PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001900 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001901 );
1902
Mark Hammond91a681d2002-08-12 07:21:58 +00001903PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001904 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001905 );
1906
Mark Hammond91a681d2002-08-12 07:21:58 +00001907PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001908 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001909 );
1910
Mark Hammond91a681d2002-08-12 07:21:58 +00001911PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001912 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001913 );
1914
Mark Hammond91a681d2002-08-12 07:21:58 +00001915PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001916 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001917 );
1918
Mark Hammond91a681d2002-08-12 07:21:58 +00001919PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001920 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001921 );
1922
Georg Brandl559e5d72008-06-11 18:37:52 +00001923PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001924 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001925 );
1926
Mark Hammond91a681d2002-08-12 07:21:58 +00001927PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001928 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001929 );
1930
Victor Stinneref8d95c2010-08-16 22:03:11 +00001931PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1932 const Py_UNICODE *u
1933 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001934
1935PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001936 Py_UNICODE *s1,
1937 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001938
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001939PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1940 Py_UNICODE *s1, const Py_UNICODE *s2);
1941
Martin v. Löwis5b222132007-06-10 09:51:05 +00001942PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001943 Py_UNICODE *s1,
1944 const Py_UNICODE *s2,
1945 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001946
1947PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001948 const Py_UNICODE *s1,
1949 const Py_UNICODE *s2
1950 );
1951
1952PyAPI_FUNC(int) Py_UNICODE_strncmp(
1953 const Py_UNICODE *s1,
1954 const Py_UNICODE *s2,
1955 size_t n
1956 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001957
1958PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001959 const Py_UNICODE *s,
1960 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001961 );
1962
Victor Stinner331ea922010-08-10 16:37:20 +00001963PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001964 const Py_UNICODE *s,
1965 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001966 );
1967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968PyAPI_FUNC(size_t) Py_UCS4_strlen(
1969 const Py_UCS4 *u
1970 );
1971
1972PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1973 Py_UCS4 *s1,
1974 const Py_UCS4 *s2);
1975
1976PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1977 Py_UCS4 *s1, const Py_UCS4 *s2);
1978
1979PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1980 Py_UCS4 *s1,
1981 const Py_UCS4 *s2,
1982 size_t n);
1983
1984PyAPI_FUNC(int) Py_UCS4_strcmp(
1985 const Py_UCS4 *s1,
1986 const Py_UCS4 *s2
1987 );
1988
1989PyAPI_FUNC(int) Py_UCS4_strncmp(
1990 const Py_UCS4 *s1,
1991 const Py_UCS4 *s2,
1992 size_t n
1993 );
1994
1995PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1996 const Py_UCS4 *s,
1997 Py_UCS4 c
1998 );
1999
2000PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
2001 const Py_UCS4 *s,
2002 Py_UCS4 c
2003 );
2004
Victor Stinner71133ff2010-09-01 23:43:53 +00002005/* Create a copy of a unicode string ending with a nul character. Return NULL
2006 and raise a MemoryError exception on memory allocation failure, otherwise
2007 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2008
Victor Stinner46408602010-09-03 16:18:00 +00002009PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002010 PyObject *unicode
2011 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002012#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002013
Guido van Rossumd8225182000-03-10 22:33:05 +00002014#ifdef __cplusplus
2015}
2016#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002017#endif /* !Py_UNICODEOBJECT_H */