blob: 331e8399914173a74fd451be36a768490f2df23e [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t.  Neither name is exposed when Py_LIMITED_API is
   defined. */

#if !defined(Py_LIMITED_API)
typedef wchar_t Py_UNICODE;
#define PY_UNICODE_TYPE wchar_t
#endif
95
/* If the compiler provides a wchar_t type, it is supported through the
   interface functions PyUnicode_FromWideChar(), PyUnicode_AsWideChar()
   and PyUnicode_AsWideCharString(). */

/* HAVE_USABLE_WCHAR_T implies HAVE_WCHAR_H. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
#  define HAVE_WCHAR_H
#endif

/* HAVE_MBCS is defined exactly when MS_WINDOWS is defined. */
#ifdef MS_WINDOWS
#  define HAVE_MBCS
#endif

#if defined(HAVE_WCHAR_H)
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
#  if defined(_HAVE_BSDI)
#    include <time.h>
#  endif
#  include <wchar.h>
#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test with an inlined fast path: values below 128 are looked
   up in the _Py_ascii_whitespace table, all other values are handed to
   _PyUnicode_IsWhitespace().  Note that ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
/* Character classification and conversion shortcuts.  Each macro simply
   forwards its argument to the _PyUnicode_* helper named on the right. */

/* Predicates. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
/* True when ch passes any of the alpha, decimal, digit or numeric tests.
   The tests run in that order and stop at the first one that succeeds;
   ch may therefore be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy length Py_UNICODE elements from source to target with Py_MEMCPY;
   the byte count passed on is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), sizeof(Py_UNICODE) * (length))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store value into the first length elements of target.

   All three arguments are evaluated exactly once: target, value and
   length are captured in locals before the loop starts, so expressions
   with side effects (for example a function call used as the length) are
   safe to pass.  A length of zero or less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.

   ch is parenthesized in every expansion so that an expression argument
   (e.g. "c & 0xFFFF" or a conditional) is classified as a whole instead
   of being re-associated with the comparison operators.  ch is evaluated
   twice by each IS_* macro. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value: the low
   10 bits of high and of low are combined and offset by 0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before memcmp() is run
   over the whole substring.  Both arguments are accessed through their
   wstr and wstr_length members. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    (((string)->wstr[(offset)] == (substring)->wstr[0]) && \
     ((string)->wstr[(offset) + (substring)->wstr_length - 1] == \
      (substring)->wstr[(substring)->wstr_length - 1]) && \
     memcmp((string)->wstr + (offset), (substring)->wstr, \
            (substring)->wstr_length * sizeof(Py_UNICODE)) == 0)
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200209 /* There a 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
218 * utf8 = data
219
220 - compact:
221
222 * structure = PyCompactUnicodeObject
223 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
224 PyUnicode_4BYTE_KIND
225 * compact = 1
226 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200227 * ascii = 0
Victor Stinner85041a52011-10-03 14:42:39 +0200228 * utf8 != data
Victor Stinner910337b2011-10-03 03:20:16 +0200229
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200230 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200231
232 * structure = PyUnicodeObject
233 * kind = PyUnicode_WCHAR_KIND
234 * compact = 0
235 * ready = 0
236 * wstr is not NULL
237 * data.any is NULL
238 * utf8 is NULL
239 * interned = SSTATE_NOT_INTERNED
Victor Stinnera3b334d2011-10-03 13:53:37 +0200240 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200241
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200242 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200243
244 * structure = PyUnicodeObject structure
245 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
246 PyUnicode_4BYTE_KIND
247 * compact = 0
248 * ready = 1
249 * data.any is not NULL
Victor Stinner85041a52011-10-03 14:42:39 +0200250 * utf8 = data if ascii is 1
Victor Stinner910337b2011-10-03 03:20:16 +0200251
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200252 Compact strings use only one memory block (structure + characters),
253 whereas legacy strings use one block for the structure and one block
254 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200255
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200256 Legacy strings are created by PyUnicode_FromUnicode() and
257 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
258 when PyUnicode_READY() is called.
259
260 See also _PyUnicode_CheckConsistency().
261 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000262 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200263 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000264 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265 struct {
266 /*
267 SSTATE_NOT_INTERNED (0)
268 SSTATE_INTERNED_MORTAL (1)
269 SSTATE_INTERNED_IMMORTAL (2)
270
271 If interned != SSTATE_NOT_INTERNED, the two references from the
272 dictionary to this object are *not* counted in ob_refcnt.
273 */
274 unsigned int interned:2;
275 /* Character size:
276
277 PyUnicode_WCHAR_KIND (0): wchar_t*
278 PyUnicode_1BYTE_KIND (1): Py_UCS1*
279 PyUnicode_2BYTE_KIND (2): Py_UCS2*
280 PyUnicode_4BYTE_KIND (3): Py_UCS4*
281 */
282 unsigned int kind:2;
283 /* Compact is with respect to the allocation scheme. Compact unicode
284 objects only require one memory block while non-compact objects use
285 one block for the PyUnicodeObject struct and another for its data
286 buffer. */
287 unsigned int compact:1;
Victor Stinnera3b334d2011-10-03 13:53:37 +0200288 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
289 characters. If ascii is 1 and compact is 1, use the PyASCIIObject
290 structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200291 unsigned int ascii:1;
292 /* The ready flag indicates whether the object layout is initialized
293 completely. This means that this is either a compact object, or
294 the data pointer is filled out. The bit is redundant, and helps
295 to minimize the test in PyUnicode_IS_READY(). */
296 unsigned int ready:1;
297 } state;
298 wchar_t *wstr; /* wchar_t representation (null-terminated) */
299} PyASCIIObject;
300
301/* Non-ASCII strings allocated through PyUnicode_New use the
302 PyCompactUnicodeOject structure. state.compact is set, and the data
303 immediately follow the structure. */
304typedef struct {
305 PyASCIIObject _base;
306 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
307 * terminating \0. */
308 char *utf8; /* UTF-8 representation (null-terminated) */
309 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
310 * surrogates count as two code points. */
311} PyCompactUnicodeObject;
312
313/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
314 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200315 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316typedef struct {
317 PyCompactUnicodeObject _base;
318 union {
319 void *any;
320 Py_UCS1 *latin1;
321 Py_UCS2 *ucs2;
322 Py_UCS4 *ucs4;
323 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000324} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000325#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000326
Mark Hammond91a681d2002-08-12 07:21:58 +0000327PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000328PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000329
Thomas Wouters27d517b2007-02-25 20:39:11 +0000330#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000331 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
332#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000333
334/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000335#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200336
/* Length of the wstr representation of op.  A compact ASCII object has no
   wstr_length member, so its length field is used instead; every other
   object is read through PyCompactUnicodeObject.wstr_length.

   op is parenthesized inside both casts so that an expression argument
   (for instance pointer arithmetic) is cast as a whole.  No type checks
   are performed, and op is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
341
/* Size of the deprecated Py_UNICODE representation in code units (a
   surrogate pair counts as 2 units).  When the wstr pointer is not set
   yet, PyUnicode_AsUnicode() is called first so that the representation
   is computed on request; its return value is discarded.  Use
   PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     !(((PyASCIIObject *)(op))->wstr) ? \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
       PyUnicode_WSTR_LENGTH(op)) : \
      PyUnicode_WSTR_LENGTH(op))

/* PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (Py_UNICODE_SIZE * PyUnicode_GET_SIZE(op))
356
/* Alias for PyUnicode_AsUnicode().  The wchar_t/Py_UNICODE representation
   is created on demand: the cached wstr pointer is returned when it is
   set, otherwise PyUnicode_AsUnicode() is called.  Using this macro is
   very inefficient now; try to port your code to the new
   PyUnicode_*BYTE_DATA() macros or to PyUnicode_WRITE() and
   PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     !(((PyASCIIObject *)(op))->wstr) ? \
      PyUnicode_AsUnicode((PyObject *)(op)) : \
      (((PyASCIIObject *)(op))->wstr))

/* The result of PyUnicode_AS_UNICODE(op) viewed as a const char pointer. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
369
370
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
372
373/* Values for PyUnicodeObject.state: */
374
/* Interning state: the values stored in the state.interned bit field. */
#define SSTATE_NOT_INTERNED         0
#define SSTATE_INTERNED_MORTAL      1
#define SSTATE_INTERNED_IMMORTAL    2
379
/* Return true if the string contains only ASCII characters, or 0 if not.
   The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not.  No type
   checks or Ready calls are performed.  op is parenthesized inside the
   cast so that an expression argument is cast as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed;
   op is evaluated up to two times. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200395
/* Kind of a string that only has the wstr representation.  This is only
   possible when the string was created with a legacy API and
   _PyUnicode_Ready() has not been called yet. */
#define PyUnicode_WCHAR_KIND    0

/* Return values of the PyUnicode_KIND() macro: */
#define PyUnicode_1BYTE_KIND    1
#define PyUnicode_2BYTE_KIND    2
#define PyUnicode_4BYTE_KIND    3
406
407
/* Number of bytes the string uses to represent a single character; this
   can be 1, 2 or 4.  The value is computed as 2 ** (kind - 1) from the
   result of PyUnicode_KIND(op).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op) \
    (1 << (PyUnicode_KIND(op) - 1))
414
/* Typed views of the canonical representation: the pointer returned by
   PyUnicode_DATA() cast to Py_UCS1*, Py_UCS2* or Py_UCS4* for direct
   character access.  No checks are performed; use
   PyUnicode_CHARACTER_SIZE() or PyUnicode_KIND() beforehand to ensure
   these will work correctly. */

#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
423
/* Return one of the PyUnicode_*_KIND values defined above, read from the
   state.kind bit field.  Asserts that op passes PyUnicode_Check() and
   PyUnicode_IS_READY(). */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->state.kind)
429
/* Data pointer of a compact string: the address right behind the
   PyASCIIObject header for compact ASCII strings, otherwise the address
   right behind the PyCompactUnicodeObject header. */
#define _PyUnicode_COMPACT_DATA(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((void*)((PyASCIIObject*)(op) + 1)) : \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Data pointer of a non-compact string: the data.any member, which is
   asserted to be set. */
#define _PyUnicode_NONCOMPACT_DATA(op) \
    (assert(((PyUnicodeObject*)(op))->data.any), \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Return a void pointer to the raw unicode buffer, selecting one of the
   two helpers above according to PyUnicode_IS_COMPACT(op). */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? \
      _PyUnicode_COMPACT_DATA(op) : _PyUnicode_NONCOMPACT_DATA(op))
444
/* Compute (index * char_size) where char_size is 2 ** (kind - 1), by
   shifting index left by (kind - 1) bits.
   The index is a character index, the result is a size in bytes.

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) \
    ((index) << ((kind) - 1))
450
451/* In the access macros below, "kind" may be evaluated more than once.
452 All other macro parameters are evaluated exactly once, so it is safe
453 to put side effects into them (such as increasing the index). */
454
/* Write into the canonical representation.  This macro does not do any
   sanity checks and is intended for usage in loops; the caller should
   cache the kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   value is cast to the element type selected by kind before it is
   stored.  Any kind other than the 1-byte and 2-byte kinds is asserted to
   be PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
477
/* Read a code point from the string's canonical representation.  data is
   indexed as a Py_UCS1, Py_UCS2 or Py_UCS4 array depending on kind and the
   element is returned as a Py_UCS4.  No checks or ready calls are
   performed. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) != PyUnicode_1BYTE_KIND ? \
      ((kind) != PyUnicode_2BYTE_KIND ? \
       ((const Py_UCS4 *)(data))[(index)] : \
       ((const Py_UCS2 *)(data))[(index)]) : \
      ((const Py_UCS1 *)(data))[(index)]))
489
/* Read the code point at index from a Unicode object.

   PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR; for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.  The object is asserted to
   pass PyUnicode_Check() and PyUnicode_IS_READY(). */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     (Py_UCS4) \
      (PyUnicode_KIND((unicode)) != PyUnicode_1BYTE_KIND ? \
       (PyUnicode_KIND((unicode)) != PyUnicode_2BYTE_KIND ? \
        ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] : \
        ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)]) : \
       ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200505
/* Returns the length of the unicode string, as stored in the length field.
   The caller has to make sure that the string has its canonical
   representation set before calling this macro (see PyUnicode_READY());
   this is checked with an assertion only. */
#define PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->length)
513
514
/* Fast check to determine whether an object is ready; reads the
   state.ready bit.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   op is parenthesized inside the cast so that an expression argument is
   cast as a whole.  No type checks are performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
519
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: when the object is already ready the result is 0 without any
   call.  If the canonical representation is not yet set, it will still
   call _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \
    (assert(PyUnicode_Check(op)), \
     (!PyUnicode_IS_READY(op) ? \
      _PyUnicode_Ready((PyObject *)(op)) : 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200528
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string:

     - compact ASCII string               -> 0x7f
     - 1-byte kind, data pointer == utf8  -> 0x7f
     - 1-byte kind otherwise              -> 0xff
     - 2-byte kind                        -> 0xffff
     - anything else                      -> 0x10ffff

   op is asserted to be ready. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) ? (0x7fU) : \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU)) : \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
        (0xffffU) : (0x10ffffU)))))
542
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000543#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000544
545/* --- Constants ---------------------------------------------------------- */
546
/* Replacement character used during decoding if the errors argument is
   set to "replace".  Note: the Unicode character U+FFFD is the official
   REPLACEMENT CHARACTER in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER \
    ((Py_UCS4)0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000553
554/* === Public API ========================================================= */
555
556/* --- Plain Py_UNICODE --------------------------------------------------- */
557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558/* With PEP 393, this is the recommended way to allocate a new unicode object.
559 This function will allocate the object and its buffer in a single memory
560 block. Objects created using this function are not resizable. */
561#ifndef Py_LIMITED_API
562PyAPI_FUNC(PyObject*) PyUnicode_New(
563 Py_ssize_t size, /* Number of code points in the new string */
564 Py_UCS4 maxchar /* maximum code point value in the string */
565 );
566#endif
567
Victor Stinnerd8f65102011-09-29 19:43:17 +0200568/* Initializes the canonical string representation from a the deprecated
569 wstr/Py_UNICODE representation. This function is used to convert Unicode
570 objects which were created using the old API to the new flexible format
571 introduced with PEP 393.
572
573 Don't call this function directly, use the public PyUnicode_READY() macro
574 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200575#ifndef Py_LIMITED_API
576PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200577 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200578 );
579#endif
580
Victor Stinner034f6cf2011-09-30 02:26:44 +0200581/* Get a copy of a Unicode string. */
582PyAPI_FUNC(PyObject*) PyUnicode_Copy(
583 PyObject *unicode
584 );
585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200586/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200587 character conversion when necessary and falls back to memcpy if possible.
588
Victor Stinnera0702ab2011-09-29 14:14:38 +0200589 Fail if to is too small (smaller than how_many or smaller than
590 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
591 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200592
593 Return the number of written character, or return -1 and raise an exception
594 on error.
595
596 Pseudo-code:
597
598 how_many = min(how_many, len(from) - from_start)
599 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
600 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200601
602 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200603 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200605PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606 PyObject *to,
607 Py_ssize_t to_start,
608 PyObject *from,
609 Py_ssize_t from_start,
610 Py_ssize_t how_many
611 );
612#endif
613
Guido van Rossumd8225182000-03-10 22:33:05 +0000614/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000616
617 u may be NULL which causes the contents to be undefined. It is the
618 user's responsibility to fill in the needed data afterwards. Note
619 that modifying the Unicode object contents after construction is
620 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000621
622 The buffer is copied into the new object. */
623
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000624#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000625PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000626 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000627 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000628 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000629#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000630
Georg Brandl952867a2010-06-27 10:17:12 +0000631/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000632PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000633 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000634 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000635 );
636
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000637/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000639PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000640 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000641 );
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643#ifndef Py_LIMITED_API
644PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
645 int kind,
646 const void *buffer,
647 Py_ssize_t size);
648#endif
649
650PyAPI_FUNC(PyObject*) PyUnicode_Substring(
651 PyObject *str,
652 Py_ssize_t start,
653 Py_ssize_t end);
654
655/* Copy the string into a UCS4 buffer including the null character if copy_null
656 is set. Return NULL and raise an exception on error. Raise a ValueError if
657 the buffer is smaller than the string. Return buffer on success.
658
659 buflen is the length of the buffer in (Py_UCS4) characters. */
660PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
661 PyObject *unicode,
662 Py_UCS4* buffer,
663 Py_ssize_t buflen,
664 int copy_null);
665
666/* Copy the string into a UCS4 buffer. A new buffer is allocated using
667 * PyMem_Malloc; if this fails, NULL is returned with a memory error
668 exception set. */
669PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
670
Guido van Rossumd8225182000-03-10 22:33:05 +0000671/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 Py_UNICODE buffer.
673 If the wchar_t/Py_UNICODE representation is not yet available, this
674 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000675
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000676#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000677PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000679 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000680#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200682/* Return a read-only pointer to the Unicode object's internal
683 Py_UNICODE buffer and save the length at size.
684 If the wchar_t/Py_UNICODE representation is not yet available, this
685 function will calculate it. */
686
687#ifndef Py_LIMITED_API
688PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
689 PyObject *unicode, /* Unicode object */
690 Py_ssize_t *size /* location where to save the length */
691 );
692#endif
693
Guido van Rossumd8225182000-03-10 22:33:05 +0000694/* Get the length of the Unicode object. */
695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200696PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
697 PyObject *unicode
698);
699
Victor Stinner157f83f2011-09-28 21:41:31 +0200700/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 string representation. */
702
Martin v. Löwis18e16552006-02-15 17:27:45 +0000703PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000705 );
706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707/* Read a character from the string. */
708
709PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
710 PyObject *unicode,
711 Py_ssize_t index
712 );
713
714/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200715 PyUnicode_New, must not be shared, and must not have been hashed yet.
716
717 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718
719PyAPI_FUNC(int) PyUnicode_WriteChar(
720 PyObject *unicode,
721 Py_ssize_t index,
722 Py_UCS4 character
723 );
724
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000725#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000726/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000727PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000728#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000729
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200730/* Resize a Unicode object allocated by the legacy API (e.g.
731 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
732 PyUnicode_New) cannot be resized by this function.
733
734 The length is a number of Py_UNICODE characters (and not the number of code
735 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000736
737 *unicode is modified to point to the new (resized) object and 0
738 returned on success.
739
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200740 If the refcount on the object is 1, the function resizes the string in
741 place, which is usually faster than allocating a new string (and copying
742 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000743
744 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200745 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000746
Mark Hammond91a681d2002-08-12 07:21:58 +0000747PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 PyObject **unicode, /* Pointer to the Unicode object */
749 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000750 );
751
Guido van Rossumd8225182000-03-10 22:33:05 +0000752/* Coerce obj to a Unicode object and return a reference with
753 *incremented* refcount.
754
755 Coercion is done in the following way:
756
Georg Brandl952867a2010-06-27 10:17:12 +0000757 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000758 under the assumptions that they contain data using the UTF-8
759 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000760
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000761 2. All other objects (including Unicode objects) raise an
762 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000763
764 The API returns NULL in case of an error. The caller is responsible
765 for decref'ing the returned objects.
766
767*/
768
Mark Hammond91a681d2002-08-12 07:21:58 +0000769PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000770 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000771 const char *encoding, /* encoding */
772 const char *errors /* error handling */
773 );
774
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000775/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000776 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000778 Unicode objects are passed back as-is (subclasses are converted to
779 true Unicode objects), all other objects are delegated to
780 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000781 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000782
783 The API returns NULL in case of an error. The caller is responsible
784 for decref'ing the returned objects.
785
786*/
787
Mark Hammond91a681d2002-08-12 07:21:58 +0000788PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000790 );
791
Victor Stinner1205f272010-09-11 00:54:47 +0000792PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
793 const char *format, /* ASCII-encoded string */
794 va_list vargs
795 );
796PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
797 const char *format, /* ASCII-encoded string */
798 ...
799 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000800
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000801#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000802/* Format the object based on the format_spec, as defined in PEP 3101
803 (Advanced String Formatting). */
804PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805 PyObject *format_spec,
806 Py_ssize_t start,
807 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000808#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000809
Walter Dörwald16807132007-05-25 13:52:07 +0000810PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
811PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000812PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
813 const char *u /* UTF-8 encoded string */
814 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000815#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000816PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000817#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000818
819/* Evaluates to the `interned` field of the object's PyASCIIObject state.
   op is cast to PyASCIIObject * without any type check, so use only if
   you know it's a string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820#define PyUnicode_CHECK_INTERNED(op) \
821 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000822
Guido van Rossumd8225182000-03-10 22:33:05 +0000823/* --- wchar_t support for platforms which support it --------------------- */
824
825#ifdef HAVE_WCHAR_H
826
Georg Brandl952867a2010-06-27 10:17:12 +0000827/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000828 size.
829
830 The buffer is copied into the new object. */
831
Mark Hammond91a681d2002-08-12 07:21:58 +0000832PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000833 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000834 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000835 );
836
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000837/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000838 most size wchar_t characters are copied.
839
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000840 Note that the resulting wchar_t string may or may not be
841 0-terminated. It is the responsibility of the caller to make sure
842 that the wchar_t string is 0-terminated in case this is required by
843 the application.
844
845 Returns the number of wchar_t characters copied (excluding a
846 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000847 error. */
848
Martin v. Löwis18e16552006-02-15 17:27:45 +0000849PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000850 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000851 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000852 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000853 );
854
Victor Stinner137c34c2010-09-29 10:25:54 +0000855/* Convert the Unicode object to a wide character string. The output string
856 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200857 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000858
859 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
860 on success. On error, returns NULL, *size is undefined and raises a
861 MemoryError. */
862
863PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000864 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000865 Py_ssize_t *size /* number of characters of the result */
866 );
867
Victor Stinner9f789e72011-10-01 03:57:28 +0200868#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200869PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200870#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200871
Guido van Rossumd8225182000-03-10 22:33:05 +0000872#endif
873
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000874/* --- Unicode ordinals --------------------------------------------------- */
875
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876/* Create a Unicode Object from the given Unicode code point ordinal.
877
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000878 The ordinal must be in range(0x10000) on narrow Python builds
879 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
880 raised in case it is not.
881
882*/
883
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000884PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000885
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000886/* --- Free-list management ----------------------------------------------- */
887
888/* Clear the free list used by the Unicode implementation.
889
890 This can be used to release memory used for objects on the free
891 list back to the Python memory allocator.
892
893*/
894
895PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
896
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000898
899 Many of these APIs take two arguments encoding and errors. These
900 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000901 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000902
Georg Brandl952867a2010-06-27 10:17:12 +0000903 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000904
905 Error handling is set by errors which may also be set to NULL
906 meaning to use the default handling defined for the codec. Default
907 error handling for all builtin codecs is "strict" (ValueErrors are
908 raised).
909
910 The codecs all use a similar interface. Only deviations from the
911 generic ones are documented.
912
913*/
914
Fred Drakecb093fe2000-05-09 19:51:53 +0000915/* --- Manage the default encoding ---------------------------------------- */
916
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000917/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000918 Unicode object unicode and the size of the encoded representation
919 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000920
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000921 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000922
Victor Stinner157f83f2011-09-28 21:41:31 +0200923 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200924 and subsequent calls will return the same string. The memory is released
925 when the unicodeobject is deallocated.
926
927 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
928 support the previous internal function with the same behaviour.
929
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000930 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000931 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000932
933 *** If you need to access the Unicode object as UTF-8 bytes string,
934 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000935*/
936
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000937#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000940 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200941#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000942#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000943
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000944/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000945 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200947 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
948 in the unicodeobject.
949
950 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
951 support the previous internal function with the same behaviour.
952
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000953 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000954 extracted from the returned data.
955
956 *** This API is for interpreter INTERNAL USE ONLY and will likely
957 *** be removed or changed in the future.
958
959 *** If you need to access the Unicode object as UTF-8 bytes string,
960 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000961
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000962*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000963
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000964#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
966#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000967#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000968
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000969/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000970
Mark Hammond91a681d2002-08-12 07:21:58 +0000971PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000972
Guido van Rossumd8225182000-03-10 22:33:05 +0000973/* --- Generic Codecs ----------------------------------------------------- */
974
975/* Create a Unicode object by decoding the encoded string s of the
976 given size. */
977
Mark Hammond91a681d2002-08-12 07:21:58 +0000978PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000979 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000980 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000981 const char *encoding, /* encoding */
982 const char *errors /* error handling */
983 );
984
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000985/* Decode a Unicode object unicode and return the result as Python
986 object. */
987
988PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000989 PyObject *unicode, /* Unicode object */
990 const char *encoding, /* encoding */
991 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000992 );
993
994/* Decode a Unicode object unicode and return the result as Unicode
995 object. */
996
997PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000998 PyObject *unicode, /* Unicode object */
999 const char *encoding, /* encoding */
1000 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001001 );
1002
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001003/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001004 Python string object. */
1005
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001006#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001007PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001008 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001009 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001010 const char *encoding, /* encoding */
1011 const char *errors /* error handling */
1012 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001013#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001014
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001015/* Encodes a Unicode object and returns the result as Python
1016 object. */
1017
1018PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001019 PyObject *unicode, /* Unicode object */
1020 const char *encoding, /* encoding */
1021 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001022 );
1023
Guido van Rossumd8225182000-03-10 22:33:05 +00001024/* Encodes a Unicode object and returns the result as Python string
1025 object. */
1026
Mark Hammond91a681d2002-08-12 07:21:58 +00001027PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001028 PyObject *unicode, /* Unicode object */
1029 const char *encoding, /* encoding */
1030 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001031 );
1032
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001033/* Encodes a Unicode object and returns the result as Unicode
1034 object. */
1035
1036PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037 PyObject *unicode, /* Unicode object */
1038 const char *encoding, /* encoding */
1039 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001040 );
1041
1042/* Build an encoding map. */
1043
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001044PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1045 PyObject* string /* 256 character map */
1046 );
1047
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048/* --- UTF-7 Codecs ------------------------------------------------------- */
1049
Mark Hammond91a681d2002-08-12 07:21:58 +00001050PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 const char *string, /* UTF-7 encoded string */
1052 Py_ssize_t length, /* size of string */
1053 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001054 );
1055
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001056PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 const char *string, /* UTF-7 encoded string */
1058 Py_ssize_t length, /* size of string */
1059 const char *errors, /* error handling */
1060 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001061 );
1062
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001063#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001064PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 const Py_UNICODE *data, /* Unicode char buffer */
1066 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1067 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1068 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1069 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001071#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001072
Guido van Rossumd8225182000-03-10 22:33:05 +00001073/* --- UTF-8 Codecs ------------------------------------------------------- */
1074
Mark Hammond91a681d2002-08-12 07:21:58 +00001075PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001076 const char *string, /* UTF-8 encoded string */
1077 Py_ssize_t length, /* size of string */
1078 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001079 );
1080
Walter Dörwald69652032004-09-07 20:24:22 +00001081PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001082 const char *string, /* UTF-8 encoded string */
1083 Py_ssize_t length, /* size of string */
1084 const char *errors, /* error handling */
1085 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001086 );
1087
Mark Hammond91a681d2002-08-12 07:21:58 +00001088PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001090 );
1091
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001092#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1094 PyObject *unicode,
1095 const char *errors);
1096
Mark Hammond91a681d2002-08-12 07:21:58 +00001097PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 const Py_UNICODE *data, /* Unicode char buffer */
1099 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1100 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001101 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001102#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001103
Walter Dörwald41980ca2007-08-16 21:55:45 +00001104/* --- UTF-32 Codecs ------------------------------------------------------ */
1105
1106/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1107 the corresponding Unicode object.
1108
1109 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001111
1112 If byteorder is non-NULL, the decoder starts decoding using the
1113 given byte order:
1114
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 *byteorder == -1: little endian
1116 *byteorder == 0: native order
1117 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001118
1119 In native mode, the first four bytes of the stream are checked for a
1120 BOM mark. If found, the BOM mark is analysed, the byte order
1121 adjusted and the BOM skipped. In the other modes, no BOM mark
1122 interpretation is done. After completion, *byteorder is set to the
1123 current byte order at the end of input data.
1124
1125 If byteorder is NULL, the codec starts in native order mode.
1126
1127*/
1128
1129PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001130 const char *string, /* UTF-32 encoded string */
1131 Py_ssize_t length, /* size of string */
1132 const char *errors, /* error handling */
1133 int *byteorder /* pointer to byteorder to use
1134 0=native;-1=LE,1=BE; updated on
1135 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001136 );
1137
1138PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001139 const char *string, /* UTF-32 encoded string */
1140 Py_ssize_t length, /* size of string */
1141 const char *errors, /* error handling */
1142 int *byteorder, /* pointer to byteorder to use
1143 0=native;-1=LE,1=BE; updated on
1144 exit */
1145 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001146 );
1147
1148/* Returns a Python string using the UTF-32 encoding in native byte
1149 order. The string always starts with a BOM mark. */
1150
1151PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001152 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001153 );
1154
1155/* Returns a Python string object holding the UTF-32 encoded value of
1156 the Unicode data.
1157
1158 If byteorder is not 0, output is written according to the following
1159 byte order:
1160
1161 byteorder == -1: little endian
1162 byteorder == 0: native byte order (writes a BOM mark)
1163 byteorder == 1: big endian
1164
1165 If byteorder is 0, the output string will always start with the
1166 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1167 prepended.
1168
1169*/
1170
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001171#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001172PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 const Py_UNICODE *data, /* Unicode char buffer */
1174 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1175 const char *errors, /* error handling */
1176 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001177 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001178#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001179
Guido van Rossumd8225182000-03-10 22:33:05 +00001180/* --- UTF-16 Codecs ------------------------------------------------------ */
1181
Guido van Rossum9e896b32000-04-05 20:11:21 +00001182/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001183 the corresponding Unicode object.
1184
1185 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001187
1188 If byteorder is non-NULL, the decoder starts decoding using the
1189 given byte order:
1190
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001191 *byteorder == -1: little endian
1192 *byteorder == 0: native order
1193 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001194
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001195 In native mode, the first two bytes of the stream are checked for a
1196 BOM mark. If found, the BOM mark is analysed, the byte order
1197 adjusted and the BOM skipped. In the other modes, no BOM mark
1198 interpretation is done. After completion, *byteorder is set to the
1199 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001200
1201 If byteorder is NULL, the codec starts in native order mode.
1202
1203*/
1204
Mark Hammond91a681d2002-08-12 07:21:58 +00001205PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 const char *string, /* UTF-16 encoded string */
1207 Py_ssize_t length, /* size of string */
1208 const char *errors, /* error handling */
1209 int *byteorder /* pointer to byteorder to use
1210 0=native;-1=LE,1=BE; updated on
1211 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001212 );
1213
Walter Dörwald69652032004-09-07 20:24:22 +00001214PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 const char *string, /* UTF-16 encoded string */
1216 Py_ssize_t length, /* size of string */
1217 const char *errors, /* error handling */
1218 int *byteorder, /* pointer to byteorder to use
1219 0=native;-1=LE,1=BE; updated on
1220 exit */
1221 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001222 );
1223
Guido van Rossumd8225182000-03-10 22:33:05 +00001224/* Returns a Python string using the UTF-16 encoding in native byte
1225 order. The string always starts with a BOM mark. */
1226
Mark Hammond91a681d2002-08-12 07:21:58 +00001227PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001229 );
1230
1231/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001232 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001233
1234 If byteorder is not 0, output is written according to the following
1235 byte order:
1236
1237 byteorder == -1: little endian
1238 byteorder == 0: native byte order (writes a BOM mark)
1239 byteorder == 1: big endian
1240
1241 If byteorder is 0, the output string will always start with the
1242 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1243 prepended.
1244
1245 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1246 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001247 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001248
1249*/
1250
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001251#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001252PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 const Py_UNICODE *data, /* Unicode char buffer */
1254 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1255 const char *errors, /* error handling */
1256 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001257 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001258#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001259
1260/* --- Unicode-Escape Codecs ---------------------------------------------- */
1261
Mark Hammond91a681d2002-08-12 07:21:58 +00001262PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001263 const char *string, /* Unicode-Escape encoded string */
1264 Py_ssize_t length, /* size of string */
1265 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001266 );
1267
Mark Hammond91a681d2002-08-12 07:21:58 +00001268PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001270 );
1271
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001272#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001273PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 const Py_UNICODE *data, /* Unicode char buffer */
1275 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001276 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001277#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001278
1279/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1280
Mark Hammond91a681d2002-08-12 07:21:58 +00001281PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 const char *string, /* Raw-Unicode-Escape encoded string */
1283 Py_ssize_t length, /* size of string */
1284 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001285 );
1286
Mark Hammond91a681d2002-08-12 07:21:58 +00001287PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001289 );
1290
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001291#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001292PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001293 const Py_UNICODE *data, /* Unicode char buffer */
1294 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001295 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001296#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001297
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001298/* --- Unicode Internal Codec ---------------------------------------------
1299
1300 Only for internal use in _codecsmodule.c */
1301
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001302#ifndef Py_LIMITED_API
/* NOTE(review): unlike the neighbouring declarations this one is not wrapped
   in PyAPI_FUNC(); presumably intentional because it is internal-only --
   confirm. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001303PyObject *_PyUnicode_DecodeUnicodeInternal(
1304 const char *string, /* encoded input buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001305 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001306 const char *errors /* error handling */
1307 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001308#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001309
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001311
1312 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1313
1314*/
1315
Mark Hammond91a681d2002-08-12 07:21:58 +00001316PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 const char *string, /* Latin-1 encoded string */
1318 Py_ssize_t length, /* size of string */
1319 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001320 );
1321
Mark Hammond91a681d2002-08-12 07:21:58 +00001322PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001324 );
1325
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001326#ifndef Py_LIMITED_API
/* Private variant of PyUnicode_AsLatin1String() that additionally takes an
   `errors` argument; presumably encodes using that error handler --
   confirm against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1328 PyObject* unicode, /* Unicode object */
1329 const char* errors); /* error handling */
1330
Mark Hammond91a681d2002-08-12 07:21:58 +00001331PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 const Py_UNICODE *data, /* Unicode char buffer */
1333 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1334 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001335 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001336#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001337
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001339
1340 Only 7-bit ASCII data is accepted. All other codes generate errors.
1341
1342*/
1343
Mark Hammond91a681d2002-08-12 07:21:58 +00001344PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001345 const char *string, /* ASCII encoded string */
1346 Py_ssize_t length, /* size of string */
1347 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001348 );
1349
Mark Hammond91a681d2002-08-12 07:21:58 +00001350PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001352 );
1353
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001354#ifndef Py_LIMITED_API
/* Private variant of PyUnicode_AsASCIIString() that additionally takes an
   `errors` argument; presumably encodes using that error handler --
   confirm against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1356 PyObject* unicode, /* Unicode object */
1357 const char* errors); /* error handling */
1358
Mark Hammond91a681d2002-08-12 07:21:58 +00001359PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 const Py_UNICODE *data, /* Unicode char buffer */
1361 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1362 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001363 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001364#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001369
1370 Decoding mappings must map single string characters to single
1371 Unicode characters, integers (which are then interpreted as Unicode
1372 ordinals) or None (meaning "undefined mapping" and causing an
1373 error).
1374
1375 Encoding mappings must map single Unicode characters to single
1376 string characters, integers (which are then interpreted as Latin-1
1377 ordinals) or None (meaning "undefined mapping" and causing an
1378 error).
1379
1380 If a character lookup fails with a LookupError, the character is
1381 copied as-is meaning that its ordinal value will be interpreted as
1382 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1383 to contain those mappings which map characters to different code
1384 points.
1385
1386*/
1387
Mark Hammond91a681d2002-08-12 07:21:58 +00001388PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 const char *string, /* Encoded string */
1390 Py_ssize_t length, /* size of string */
1391 PyObject *mapping, /* character mapping
1392 (char ordinal -> unicode ordinal) */
1393 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001394 );
1395
Mark Hammond91a681d2002-08-12 07:21:58 +00001396PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 PyObject *unicode, /* Unicode object */
1398 PyObject *mapping /* character mapping
1399 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001400 );
1401
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001402#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001403PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 const Py_UNICODE *data, /* Unicode char buffer */
1405 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1406 PyObject *mapping, /* character mapping
1407 (unicode ordinal -> char ordinal) */
1408 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001409 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001410#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001411
1412/* Translate a Py_UNICODE buffer of the given length by applying a
1413 character mapping table to it and return the resulting Unicode
1414 object.
1415
1416 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001418
1419 Mapping tables may be dictionaries or sequences. Unmapped character
1420 ordinals (ones which cause a LookupError) are left untouched and
1421 are copied as-is.
1422
1423*/
1424
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001425#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001426PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001427 const Py_UNICODE *data, /* Unicode char buffer */
1428 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1429 PyObject *table, /* Translate table */
1430 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001431 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001432#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001433
Victor Stinner99b95382011-07-04 14:23:54 +02001434#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001435
Guido van Rossumefec1152000-03-28 02:01:15 +00001436/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001437
Mark Hammond91a681d2002-08-12 07:21:58 +00001438PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001439 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001440 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001441 const char *errors /* error handling */
1442 );
1443
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001444PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1445 const char *string, /* MBCS encoded string */
1446 Py_ssize_t length, /* size of string */
1447 const char *errors, /* error handling */
1448 Py_ssize_t *consumed /* bytes consumed */
1449 );
1450
Mark Hammond91a681d2002-08-12 07:21:58 +00001451PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001452 PyObject *unicode /* Unicode object */
1453 );
1454
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001455#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001456PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001457 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001458 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001459 const char *errors /* error handling */
1460 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001461#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001462
Victor Stinner99b95382011-07-04 14:23:54 +02001463#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001464
Guido van Rossum9e896b32000-04-05 20:11:21 +00001465/* --- Decimal Encoder ---------------------------------------------------- */
1466
1467/* Takes a Unicode string holding a decimal value and writes it into
1468 an output buffer using standard ASCII digit codes.
1469
1470 The output buffer has to provide at least length+1 bytes of storage
1471 area. The output string is 0-terminated.
1472
1473 The encoder converts whitespace to ' ', decimal characters to their
1474 corresponding ASCII digit and all other Latin-1 characters except
1475 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1476 are treated as errors. This includes embedded NUL characters.
1477
1478 Error handling is defined by the errors argument:
1479
1480 NULL or "strict": raise a ValueError
1481 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001483 "replace": replaces illegal characters with '?'
1484
1485 Returns 0 on success, -1 on failure.
1486
1487*/
1488
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001489#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001490PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001491 Py_UNICODE *s, /* Unicode buffer */
1492 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1493 char *output, /* Output buffer; must have size >= length+1 (result is 0-terminated) */
1494 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001495 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001496#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001497
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001498/* Transforms code points that have decimal digit property to the
1499 corresponding ASCII digit code points.
1500
1501 Returns a new Unicode string on success, NULL on failure.
1502*/
1503
Georg Brandlb5503082010-12-05 11:40:48 +00001504#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001505PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1506 Py_UNICODE *s, /* Unicode buffer */
1507 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1508 );
Georg Brandlb5503082010-12-05 11:40:48 +00001509#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1512 as argument instead of a raw buffer and length. This function additionally
1513 transforms spaces to ASCII because this is what the callers in longobject,
1514 floatobject, and complexobject did anyways. */
1515
1516#ifndef Py_LIMITED_API
1517PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1518 PyObject *unicode /* Unicode object */
1519 );
1520#endif
1521
Martin v. Löwis011e8422009-05-05 04:43:17 +00001522/* --- File system encoding ---------------------------------------------- */
1523
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001524/* ParseTuple converter: encode str objects to bytes using
1525 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001526
1527PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1528
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001529/* ParseTuple converter: decode bytes objects to unicode using
1530 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1531
1532PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1533
Victor Stinner77c38622010-05-14 15:58:55 +00001534/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1535 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001536
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001537 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1538 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001539
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001540 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001541*/
1542
1543PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1544 const char *s /* encoded string */
1545 );
1546
Victor Stinner77c38622010-05-14 15:58:55 +00001547/* Decode a string using Py_FileSystemDefaultEncoding
1548 and the "surrogateescape" error handler.
1549
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001550 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1551 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001552*/
1553
Martin v. Löwis011e8422009-05-05 04:43:17 +00001554PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1555 const char *s, /* encoded string */
1556 Py_ssize_t size /* size */
1557 );
1558
Victor Stinnerae6265f2010-05-15 16:27:27 +00001559/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001560 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001561
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001562 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1563 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001564*/
1565
1566PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1567 PyObject *unicode
1568 );
1569
Guido van Rossumd8225182000-03-10 22:33:05 +00001570/* --- Methods & Slots ----------------------------------------------------
1571
1572 These are capable of handling Unicode objects and strings on input
1573 (we refer to them as strings in the descriptions) and return
1574 Unicode objects or integers as appropriate. */
1575
1576/* Concat two strings giving a new Unicode string. */
1577
Mark Hammond91a681d2002-08-12 07:21:58 +00001578PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001579 PyObject *left, /* Left string */
1580 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001581 );
1582
Walter Dörwald1ab83302007-05-18 17:15:44 +00001583/* Concat two strings and put the result in *pleft
1584 (sets *pleft to NULL on error) */
1585
1586PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 PyObject **pleft, /* Pointer to left string */
1588 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001589 );
1590
1591/* Concat two strings, put the result in *pleft and drop the right object
1592 (sets *pleft to NULL on error) */
1593
1594PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 PyObject **pleft, /* Pointer to left string */
1596 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001597 );
1598
Guido van Rossumd8225182000-03-10 22:33:05 +00001599/* Split a string giving a list of Unicode strings.
1600
1601 If sep is NULL, splitting will be done at all whitespace
1602 substrings. Otherwise, splits occur at the given separator.
1603
1604 At most maxsplit splits will be done. If negative, no limit is set.
1605
1606 Separators are not included in the resulting list.
1607
1608*/
1609
Mark Hammond91a681d2002-08-12 07:21:58 +00001610PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001611 PyObject *s, /* String to split */
1612 PyObject *sep, /* String separator */
1613 Py_ssize_t maxsplit /* Maxsplit count */
1614 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001615
1616/* Ditto, but split at line breaks.
1617
1618 CRLF is considered to be one line break. Line breaks are not
1619 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620
Mark Hammond91a681d2002-08-12 07:21:58 +00001621PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 PyObject *s, /* String to split */
1623 int keepends /* If true, line end markers are included */
1624 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001625
Thomas Wouters477c8d52006-05-27 19:21:47 +00001626/* Partition a string using a given separator. */
1627
1628PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 PyObject *s, /* String to partition */
1630 PyObject *sep /* String separator */
1631 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001632
1633/* Partition a string using a given separator, searching from the end of the
1634 string. */
1635
1636PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 PyObject *s, /* String to partition */
1638 PyObject *sep /* String separator */
1639 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001640
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001641/* Split a string giving a list of Unicode strings.
1642
1643 If sep is NULL, splitting will be done at all whitespace
1644 substrings. Otherwise, splits occur at the given separator.
1645
1646 At most maxsplit splits will be done; if negative, no limit is set.
1647 Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of the
1648 string.
1649
1650 Separators are not included in the resulting list.
1651
1652*/
1653
1654PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 PyObject *s, /* String to split */
1656 PyObject *sep, /* String separator */
1657 Py_ssize_t maxsplit /* Maxsplit count */
1658 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001659
Guido van Rossumd8225182000-03-10 22:33:05 +00001660/* Translate a string by applying a character mapping table to it and
1661 return the resulting Unicode object.
1662
1663 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001664 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001665
1666 Mapping tables may be dictionaries or sequences. Unmapped character
1667 ordinals (ones which cause a LookupError) are left untouched and
1668 are copied as-is.
1669
1670*/
1671
Mark Hammond91a681d2002-08-12 07:21:58 +00001672PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001673 PyObject *str, /* String */
1674 PyObject *table, /* Translate table */
1675 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001676 );
1677
1678/* Join a sequence of strings using the given separator and return
1679 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680
Mark Hammond91a681d2002-08-12 07:21:58 +00001681PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001682 PyObject *separator, /* Separator string */
1683 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001684 );
1685
1686/* Return 1 if substr matches str[start:end] at the given tail end, 0
1687 otherwise. */
1688
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001690 PyObject *str, /* String */
1691 PyObject *substr, /* Prefix or Suffix string */
1692 Py_ssize_t start, /* Start index */
1693 Py_ssize_t end, /* Stop index */
1694 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001695 );
1696
1697/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001698 given search direction or -1 if not found. -2 is returned in case
1699 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001700
Martin v. Löwis18e16552006-02-15 17:27:45 +00001701PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 PyObject *str, /* String */
1703 PyObject *substr, /* Substring to find */
1704 Py_ssize_t start, /* Start index */
1705 Py_ssize_t end, /* Stop index */
1706 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001707 );
1708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709/* Like PyUnicode_Find, but search for a single character only. */
1710PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1711 PyObject *str, /* String */
1712 Py_UCS4 ch, /* Character to find */
1713 Py_ssize_t start, /* Start index */
1714 Py_ssize_t end, /* Stop index */
1715 int direction /* Find direction, as for PyUnicode_Find */
1716 );
1717
Barry Warsaw51ac5802000-03-20 16:36:48 +00001718/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001719
Martin v. Löwis18e16552006-02-15 17:27:45 +00001720PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001721 PyObject *str, /* String */
1722 PyObject *substr, /* Substring to count */
1723 Py_ssize_t start, /* Start index */
1724 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001725 );
1726
Barry Warsaw51ac5802000-03-20 16:36:48 +00001727/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001728 and return the resulting Unicode object. */
1729
Mark Hammond91a681d2002-08-12 07:21:58 +00001730PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001731 PyObject *str, /* String */
1732 PyObject *substr, /* Substring to find */
1733 PyObject *replstr, /* Substring to replace */
1734 Py_ssize_t maxcount /* Max. number of replacements to apply;
1735 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001736 );
1737
1738/* Compare two strings and return -1, 0, 1 for less than, equal,
1739 greater than resp. */
1740
Mark Hammond91a681d2002-08-12 07:21:58 +00001741PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 PyObject *left, /* Left string */
1743 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001744 );
1745
/* Compare a string object with an ASCII-encoded C string.
   NOTE(review): the return convention presumably matches
   PyUnicode_Compare (-1, 0, 1) -- confirm against the implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001746PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1747 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001748 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001749 );
1750
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001751/* Rich compare two strings and return one of the following:
1752
1753 - NULL in case an exception was raised
1754 - Py_True or Py_False for successful comparisons
1755 - Py_NotImplemented in case the type combination is unknown
1756
1757 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1758 case the conversion of the arguments to Unicode fails with a
1759 UnicodeDecodeError.
1760
1761 Possible values for op:
1762
1763 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1764
1765*/
1766
1767PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001768 PyObject *left, /* Left string */
1769 PyObject *right, /* Right string */
1770 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001771 );
1772
Thomas Wouters7e474022000-07-16 12:04:32 +00001773/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001774 the resulting Unicode string. */
1775
Mark Hammond91a681d2002-08-12 07:21:58 +00001776PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 PyObject *format, /* Format string */
1778 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001779 );
1780
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001781/* Checks whether element is contained in container and returns 1/0
1782 accordingly.
1783
1784 element has to coerce to a one-element Unicode string. -1 is
1785 returned in case of an error. */
1786
Mark Hammond91a681d2002-08-12 07:21:58 +00001787PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001788 PyObject *container, /* Container string */
1789 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001790 );
1791
Martin v. Löwis47383402007-08-15 07:32:56 +00001792/* Checks whether argument is a valid identifier. */
1793
1794PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1795
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001796#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001797/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001798PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001799 PyUnicodeObject *self,
1800 int striptype,
1801 PyObject *sepobj
1802 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001803#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001804
Eric Smith5807c412008-05-11 21:00:57 +00001805/* Using the current locale, insert the thousands grouping
1806 into the string pointed to by buffer. For the argument descriptions,
1807 see Objects/stringlib/localeutil.h */
1808
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001809#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001810PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1811 Py_ssize_t n_buffer,
1812 Py_UNICODE *digits,
1813 Py_ssize_t n_digits,
1814 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001815#endif
Eric Smith5807c412008-05-11 21:00:57 +00001816
Eric Smitha3b1ac82009-04-03 14:45:06 +00001817/* Using explicit passed-in values, insert the thousands grouping
1818 into the string pointed to by buffer. For the argument descriptions,
1819 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001820#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1822 int kind,
1823 void *buffer,
1824 Py_ssize_t n_buffer,
1825 void *digits,
1826 Py_ssize_t n_digits,
1827 Py_ssize_t min_width,
1828 const char *grouping,
1829 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001830#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001831/* === Characters Type APIs =============================================== */
1832
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001833/* Helper array used by Py_UNICODE_ISSPACE(). */
1834
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001835#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001836PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1837
Guido van Rossumd8225182000-03-10 22:33:05 +00001838/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001840
1841 These APIs are implemented in Objects/unicodectype.c.
1842
1843*/
1844
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is a lowercase character and
   zero otherwise -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001845PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001846 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001847 );
1848
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is an uppercase character and
   zero otherwise -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001849PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001850 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001851 );
1852
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is a titlecase character and
   zero otherwise -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001853PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001854 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001855 );
1856
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably tests the Unicode XID_Start property (characters
   allowed to begin an identifier) -- confirm in Objects/unicodectype.c. */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001857PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001858 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001859 );
1860
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably tests the Unicode XID_Continue property
   (characters allowed after the first character of an identifier) --
   confirm in Objects/unicodectype.c. */
1861PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001862 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001863 );
1864
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is a whitespace character --
   confirm in Objects/unicodectype.c.
   NOTE(review): the const on this by-value parameter does not change the
   function type seen by callers; most sibling declarations omit it.  Keep
   it in sync with the definition if either is changed. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001865PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001866 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001867 );
1868
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is a line-break character --
   confirm in Objects/unicodectype.c.
   NOTE(review): the const on this by-value parameter does not change the
   function type seen by callers; most sibling declarations omit it.  Keep
   it in sync with the definition if either is changed. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001869PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001870 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001871 );
1872
/* Maps one Unicode character to another: takes a Py_UCS4 by value and
   returns a Py_UCS4.
   NOTE(review): presumably the lowercase equivalent of ch; behaviour for
   characters without a single-character mapping is not visible here --
   confirm in Objects/unicodectype.c. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001873PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1874 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001875 );
1876
/* Maps one Unicode character to another: takes a Py_UCS4 by value and
   returns a Py_UCS4.
   NOTE(review): presumably the uppercase equivalent of ch; behaviour for
   characters without a single-character mapping is not visible here --
   confirm in Objects/unicodectype.c. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001877PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1878 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001879 );
1880
/* Maps one Unicode character to another: takes a Py_UCS4 by value and
   returns a Py_UCS4.
   NOTE(review): presumably the titlecase equivalent of ch; behaviour for
   characters without a single-character mapping is not visible here --
   confirm in Objects/unicodectype.c. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001881PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1882 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001883 );
1884
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably the decimal digit value of ch, with a negative
   result when ch has none -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001885PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001886 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001887 );
1888
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably the digit value of ch, with a negative result
   when ch has none -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001889PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001890 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001891 );
1892
/* Takes one Unicode character by value and returns a double (the only
   character-type function here with a floating-point result).
   NOTE(review): presumably the numeric value of ch, with a negative result
   when ch has none -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001893PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001894 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001895 );
1896
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch has a decimal digit value --
   confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001897PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001898 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001899 );
1900
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch has a digit value -- confirm in
   Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001901PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001902 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001903 );
1904
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch has a numeric value -- confirm
   in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001905PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001906 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001907 );
1908
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is considered printable; the
   exact definition of printable is not visible here -- confirm in
   Objects/unicodectype.c. */
Georg Brandl559e5d72008-06-11 18:37:52 +00001909PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001910 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001911 );
1912
/* Takes one Unicode character by value and returns an int.
   NOTE(review): presumably nonzero when ch is an alphabetic character --
   confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001913PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001914 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001915 );
1916
/* Takes a pointer to read-only Py_UNICODE data and returns a size_t.
   NOTE(review): presumably the Py_UNICODE counterpart of C strlen(): the
   number of units before the terminating null -- confirm at the
   definition. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001917PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1918 const Py_UNICODE *u
1919 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001920
/* Takes a writable destination s1 and a read-only source s2; returns a
   Py_UNICODE pointer.  No destination size is passed.
   NOTE(review): presumably the Py_UNICODE counterpart of C strcpy(),
   returning s1; the caller would then have to guarantee s1 is large
   enough -- confirm at the definition. */
1921PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001922 Py_UNICODE *s1,
1923 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001924
/* Takes a writable destination s1 and a read-only source s2; returns a
   Py_UNICODE pointer.  No destination size is passed.
   NOTE(review): presumably the Py_UNICODE counterpart of C strcat(),
   appending s2 to s1 and returning s1 -- confirm at the definition. */
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001925PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1926 Py_UNICODE *s1, const Py_UNICODE *s2);
1927
/* Takes a writable destination s1, a read-only source s2 and a count n;
   returns a Py_UNICODE pointer.
   NOTE(review): presumably the Py_UNICODE counterpart of C strncpy().  If
   it mirrors strncpy() exactly, s1 is left without a terminating null when
   s2 holds n or more units -- confirm at the definition. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001928PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001929 Py_UNICODE *s1,
1930 const Py_UNICODE *s2,
1931 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001932
/* Takes two pointers to read-only Py_UNICODE data and returns an int.
   NOTE(review): presumably the Py_UNICODE counterpart of C strcmp()
   (negative, zero or positive result) -- confirm at the definition. */
1933PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001934 const Py_UNICODE *s1,
1935 const Py_UNICODE *s2
1936 );
1937
/* Takes two pointers to read-only Py_UNICODE data plus a count n and
   returns an int.
   NOTE(review): presumably the Py_UNICODE counterpart of C strncmp(),
   comparing at most n units -- confirm at the definition. */
1938PyAPI_FUNC(int) Py_UNICODE_strncmp(
1939 const Py_UNICODE *s1,
1940 const Py_UNICODE *s2,
1941 size_t n
1942 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001943
/* Takes a pointer to read-only Py_UNICODE data and a Py_UNICODE value c.
   The parameter is const-qualified but the returned pointer is not, so a
   caller can obtain a writable pointer from read-only input.
   NOTE(review): presumably the Py_UNICODE counterpart of C strchr(): first
   occurrence of c in s, or NULL -- confirm at the definition. */
1944PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001945 const Py_UNICODE *s,
1946 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001947 );
1948
/* Takes a pointer to read-only Py_UNICODE data and a Py_UNICODE value c.
   The parameter is const-qualified but the returned pointer is not, so a
   caller can obtain a writable pointer from read-only input.
   NOTE(review): presumably the Py_UNICODE counterpart of C strrchr(): last
   occurrence of c in s, or NULL -- confirm at the definition. */
Victor Stinner331ea922010-08-10 16:37:20 +00001949PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001950 const Py_UNICODE *s,
1951 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001952 );
1953
/* Takes a pointer to read-only Py_UCS4 data and returns a size_t.
   Same parameter shape as Py_UNICODE_strlen, with Py_UCS4 elements.
   NOTE(review): presumably the number of elements before the terminating
   null -- confirm at the definition. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954PyAPI_FUNC(size_t) Py_UCS4_strlen(
1955 const Py_UCS4 *u
1956 );
1957
/* Takes a writable destination s1 and a read-only source s2; returns a
   Py_UCS4 pointer.  No destination size is passed.
   NOTE(review): presumably the Py_UCS4 counterpart of C strcpy(),
   returning s1; the caller would then have to guarantee s1 is large
   enough -- confirm at the definition. */
1958PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1959 Py_UCS4 *s1,
1960 const Py_UCS4 *s2);
1961
/* Takes a writable destination s1 and a read-only source s2; returns a
   Py_UCS4 pointer.  No destination size is passed.
   NOTE(review): presumably the Py_UCS4 counterpart of C strcat(),
   appending s2 to s1 and returning s1 -- confirm at the definition. */
1962PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1963 Py_UCS4 *s1, const Py_UCS4 *s2);
1964
/* Takes a writable destination s1, a read-only source s2 and a count n;
   returns a Py_UCS4 pointer.
   NOTE(review): presumably the Py_UCS4 counterpart of C strncpy().  If it
   mirrors strncpy() exactly, s1 is left without a terminating null when s2
   holds n or more elements -- confirm at the definition. */
1965PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1966 Py_UCS4 *s1,
1967 const Py_UCS4 *s2,
1968 size_t n);
1969
/* Takes two pointers to read-only Py_UCS4 data and returns an int.
   NOTE(review): presumably the Py_UCS4 counterpart of C strcmp()
   (negative, zero or positive result) -- confirm at the definition. */
1970PyAPI_FUNC(int) Py_UCS4_strcmp(
1971 const Py_UCS4 *s1,
1972 const Py_UCS4 *s2
1973 );
1974
/* Takes two pointers to read-only Py_UCS4 data plus a count n and returns
   an int.
   NOTE(review): presumably the Py_UCS4 counterpart of C strncmp(),
   comparing at most n elements -- confirm at the definition. */
1975PyAPI_FUNC(int) Py_UCS4_strncmp(
1976 const Py_UCS4 *s1,
1977 const Py_UCS4 *s2,
1978 size_t n
1979 );
1980
/* Takes a pointer to read-only Py_UCS4 data and a Py_UCS4 value c.
   The parameter is const-qualified but the returned pointer is not, so a
   caller can obtain a writable pointer from read-only input.
   NOTE(review): presumably the Py_UCS4 counterpart of C strchr(): first
   occurrence of c in s, or NULL -- confirm at the definition. */
1981PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1982 const Py_UCS4 *s,
1983 Py_UCS4 c
1984 );
1985
/* Takes a pointer to read-only Py_UCS4 data and a Py_UCS4 value c.
   The parameter is const-qualified but the returned pointer is not, so a
   caller can obtain a writable pointer from read-only input.
   NOTE(review): presumably the Py_UCS4 counterpart of C strrchr(): last
   occurrence of c in s, or NULL -- confirm at the definition. */
1986PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1987 const Py_UCS4 *s,
1988 Py_UCS4 c
1989 );
1990
Victor Stinner71133ff2010-09-01 23:43:53 +00001991/* Create a copy of a unicode string ending with a nul character. Return NULL
1992 and raise a MemoryError exception on memory allocation failure, otherwise
1993 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1994
/* Takes a PyObject pointer and returns a Py_UNICODE pointer.
   Per the comment preceding this declaration, the result is a separately
   allocated, null-terminated copy that the caller must release with
   PyMem_Free(), and NULL is returned with MemoryError set when the
   allocation fails.
   NOTE(review): behaviour when unicode is not a str object is not visible
   here -- confirm at the definition. */
Victor Stinner46408602010-09-03 16:18:00 +00001995PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001996 PyObject *unicode
1997 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001998#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001999
Guido van Rossumd8225182000-03-10 22:33:05 +00002000#ifdef __cplusplus
2001}
2002#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002003#endif /* !Py_UNICODEOBJECT_H */