blob: 3dee11f3441b427ad8d915fa2fdca49cec03a8fe [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
#ifndef Py_LIMITED_API
/* PY_UNICODE_TYPE names the underlying C type as a macro; Py_UNICODE is
   the corresponding typedef.  Both are hidden from the limited API. */
#define PY_UNICODE_TYPE wchar_t
typedef PY_UNICODE_TYPE Py_UNICODE;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test.  Values of 128 and above go through
   _PyUnicode_IsWhitespace(); everything below is answered directly from
   the _Py_ascii_whitespace look-up table.  `ch` is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
/* Thin aliases: each macro forwards its argument unchanged to the
   _PyUnicode_* helper of the matching name. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch)     _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)     _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)     _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Alphabetic, numeric and printable predicates. */
#define Py_UNICODE_ISALPHA(ch)     _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISDECIMAL(ch)   _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)     _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)   _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)     _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)     _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)     _PyUnicode_ToTitlecase(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch)   _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)     _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)   _PyUnicode_ToNumeric(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
/* True (1) if `ch` is alphabetic, a decimal, a digit or numeric; 0
   otherwise.  The checks run in that order and stop at the first hit,
   so `ch` may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (!(!Py_UNICODE_ISALPHA(ch) && \
       !Py_UNICODE_ISDECIMAL(ch) && \
       !Py_UNICODE_ISDIGIT(ch) && \
       !Py_UNICODE_ISNUMERIC(ch)))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE units from `source` to `target` through
   Py_MEMCPY; the byte count is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), sizeof(Py_UNICODE) * (length))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Every argument is evaluated exactly once (in the order target, value,
   length), so expressions with side effects are safe.  A length of zero
   or less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.  The argument is parenthesized in the
   expansion so that expressions such as `x & 0xFFFF` are tested as a
   whole; it is evaluated up to twice by each range check. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of `high` and `low` are combined and offset by 0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The wstr pointer lives in PyASCIIObject and the wstr length is obtained
   through PyUnicode_WSTR_LENGTH(), because no object structure exposes
   both `wstr` and `wstr_length` as direct members.  The wstr buffers are
   read directly, so both objects must already have a non-NULL wstr.
   The first and last code units are compared before falling back to
   memcmp() over the whole substring. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follow the structure. utf8_length and wstr_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 4 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * ready = 1
         * utf8 = data

       - compact:

         * structure = PyCompactUnicodeObject
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
         * ascii = 0
         * utf8 != data
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data and wstr_length=length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL

       - legacy string, not ready:

         * structure = PyUnicodeObject
         * kind = PyUnicode_WCHAR_KIND
         * compact = 0
         * ready = 0
         * wstr is not NULL
         * data.any is NULL
         * utf8 is NULL
         * utf8_length = 0
         * interned = SSTATE_NOT_INTERNED
         * ascii = 0

       - legacy string, ready:

         * structure = PyUnicodeObject structure
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * ready = 1
         * data.any is not NULL
         * utf8 is shared with data.any and utf8_length = length
           if ascii = 1
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data.any and wstr_length = length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by PyUnicode_FromUnicode() and
       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
       when PyUnicode_READY() is called.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    struct {
        /*
           SSTATE_NOT_INTERNED (0)
           SSTATE_INTERNED_MORTAL (1)
           SSTATE_INTERNED_IMMORTAL (2)

           If interned != SSTATE_NOT_INTERNED, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
         */
        unsigned int interned:2;
        /* Character size:

           PyUnicode_WCHAR_KIND (0): wchar_t*
           PyUnicode_1BYTE_KIND (1): Py_UCS1*
           PyUnicode_2BYTE_KIND (2): Py_UCS2*
           PyUnicode_4BYTE_KIND (3): Py_UCS4*
         */
        unsigned int kind:2;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* Set when kind is PyUnicode_1BYTE_KIND and the data contains only
           ASCII characters. If ascii is 1 and compact is 1, use the
           PyASCIIObject structure. */
        unsigned int ascii:1;
        /* The ready flag indicates whether the object layout is initialized
           completely. This means that this is either a compact object, or
           the data pointer is filled out. The bit is redundant, and helps
           to minimize the test in PyUnicode_IS_READY(). */
        unsigned int ready:1;
    } state;
    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
} PyASCIIObject;
311
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure. */
typedef struct {
    PyASCIIObject _base;        /* Common header; must stay the first member
                                 * so the object can be accessed through a
                                 * PyASCIIObject pointer. */
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
                                 * surrogates count as two code points. */
} PyCompactUnicodeObject;
323
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
   PyUnicodeObject structure. The actual string data is initially in the wstr
   block, and copied into the data block using _PyUnicode_Ready. */
typedef struct {
    PyCompactUnicodeObject _base;   /* Common header shared with the
                                     * compact forms. */
    union {
        void *any;                  /* Untyped view of the buffer; NULL
                                     * while the legacy string is not ready */
        Py_UCS1 *latin1;            /* View as Py_UCS1 elements */
        Py_UCS2 *ucs2;              /* View as Py_UCS2 elements */
        Py_UCS4 *ucs4;              /* View as Py_UCS4 elements */
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000336#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000337
Mark Hammond91a681d2002-08-12 07:21:58 +0000338PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000339PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000340
/* PyUnicode_CheckExact(op) compares the object's type pointer against
   PyUnicode_Type itself.  PyUnicode_Check(op) instead asks
   PyType_FastSubclass() whether the type carries
   Py_TPFLAGS_UNICODE_SUBCLASS. */
#define PyUnicode_CheckExact(op) (&PyUnicode_Type == Py_TYPE(op))
#define PyUnicode_Check(op) \
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
Guido van Rossumd8225182000-03-10 22:33:05 +0000344
345/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000346#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200347
/* Length of the wstr representation.  Compact ASCII objects are plain
   PyASCIIObject structures without a wstr_length member, so their
   `length` field is used; every other object is read through
   PyCompactUnicodeObject.  No type checks are performed; `op` is
   evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
352
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available (wstr is NULL), it is
   first computed by calling PyUnicode_AsUnicode(); the result of that call
   is discarded, only the resulting wstr length is returned.
   Use PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     !(((PyASCIIObject *)(op))->wstr) ? \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
       PyUnicode_WSTR_LENGTH(op)) : \
      PyUnicode_WSTR_LENGTH(op))

/* Same size expressed in bytes: code units times Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (Py_UNICODE_SIZE * PyUnicode_GET_SIZE(op))
367
/* Alias for PyUnicode_AsUnicode().  An existing wstr pointer is returned
   as is; otherwise PyUnicode_AsUnicode() is called to create the
   wchar_t/Py_UNICODE representation on demand.  Using this macro is very
   inefficient now, try to port your code to use the new
   PyUnicode_*BYTE_DATA() macros or use PyUnicode_WRITE() and
   PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     !(((PyASCIIObject *)(op))->wstr) ? \
      PyUnicode_AsUnicode((PyObject *)(op)) : \
      (((PyASCIIObject *)(op))->wstr))

/* The same pointer, viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
380
381
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
383
384/* Values for PyUnicodeObject.state: */
385
/* Interning state: the values stored in the 2-bit state.interned field
   of PyASCIIObject. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
390
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  The argument is parenthesized before the
   cast so that pointer expressions are converted as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)
396
/* Return the state.compact bit: true if the string is compact, 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) (((PyASCIIObject*)(op))->state.compact)

/* Return 1 if the string is a compact ASCII string (uses the PyASCIIObject
   structure), or 0 if not.  The ascii bit is tested first and the compact
   bit only when it is set.  No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) ? (PyUnicode_IS_COMPACT(op) != 0) : 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200406
/* The string only has the wstr (wchar_t) representation. This is only
   possible when the string was created with a legacy API and
   _PyUnicode_Ready() has not been called yet. */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro.  They are the values of
   the state.kind field and name the element type of the data buffer:
   Py_UCS1, Py_UCS2 and Py_UCS4 respectively. */

#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3
417
418
/* Return the number of bytes the string uses to represent single characters,
   this can be 1, 2 or 4.  Computed as 1 << (kind - 1), which maps the kind
   values 1, 2 and 3 to 1, 2 and 4.  PyUnicode_WCHAR_KIND (0) is not a
   valid input, since it would produce a negative shift count.

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op) \
    (1 << (PyUnicode_KIND(op) - 1))
425
/* Return a pointer to the canonical representation cast to Py_UCS1
   (unsigned char), Py_UCS2 or Py_UCS4 for direct character access.
   Each macro is a plain cast of PyUnicode_DATA(op).
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
434
/* Return one of the PyUnicode_*_KIND values defined above by reading the
   state.kind field.  Asserts that op is a unicode object and that it is
   ready; op is evaluated more than once when assertions are enabled. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->state.kind)
440
/* Data of a compact string: the address just past the object structure,
   which is PyASCIIObject for compact ASCII strings and
   PyCompactUnicodeObject for all other compact strings. */
#define _PyUnicode_COMPACT_DATA(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((void*)&((PyASCIIObject*)(op))[1]) : \
     ((void*)&((PyCompactUnicodeObject*)(op))[1]))

/* Data of a non-compact string: the separately stored data.any pointer,
   which is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op) \
    (assert(((PyUnicodeObject*)(op))->data.any), \
     (((PyUnicodeObject *)(op))->data.any))

/* Return a void pointer to the raw unicode buffer, picking the compact
   or non-compact accessor according to PyUnicode_IS_COMPACT(). */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     !(PyUnicode_IS_COMPACT(op)) ? _PyUnicode_NONCOMPACT_DATA(op) : \
                                   _PyUnicode_COMPACT_DATA(op))
455
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
   The index is a character index, the result is a size in bytes.
   `kind` must be one of the 1/2/4-byte kinds; PyUnicode_WCHAR_KIND (0)
   would produce a negative shift count.

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
461
462/* In the access macros below, "kind" may be evaluated more than once.
463 All other macro parameters are evaluated exactly once, so it is safe
464 to put side effects into them (such as increasing the index). */
465
/* Write into the canonical representation.  This macro does not do any
   sanity checks and is intended for usage in loops.  The caller should
   cache the kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   Any kind other than the 1-byte and 2-byte kinds is handled as the
   4-byte kind (asserted). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
488
/* Read a code point from the string's canonical representation and
   return it as Py_UCS4.  The 1-byte and 2-byte kinds select Py_UCS1 and
   Py_UCS2 elements; any other kind reads Py_UCS4 elements.  No checks
   or ready calls are performed. */
#define PyUnicode_READ(kind, data, index) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        (Py_UCS4)((const Py_UCS1 *)(data))[(index)] : \
     (kind) == PyUnicode_2BYTE_KIND ? \
        (Py_UCS4)((const Py_UCS2 *)(data))[(index)] : \
        (Py_UCS4)((const Py_UCS4 *)(data))[(index)])
500
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR; for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   Asserts that `unicode` is a ready unicode object, then reads through
   PyUnicode_READ() with the object's own kind and data pointer. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     PyUnicode_READ(PyUnicode_KIND((unicode)), \
                    PyUnicode_DATA((unicode)), \
                    (index)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200516
/* Returns the length of the unicode string, read from the `length` field
   (number of code points).  The caller has to make sure that the string
   has its canonical representation set before calling this macro; use
   PyUnicode_READY() to ensure that.  Readiness is only checked by an
   assertion. */
#define PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->length)
524
525
/* Fast check to determine whether an object is ready: returns the
   state.ready bit.  Equivalent to
   PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   The argument is parenthesized before the cast so that pointer
   expressions are converted as a whole.  No type checks are performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
530
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: an object that is already ready yields 0 without a function call.
   If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready() and return its result.
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \
    (assert(PyUnicode_Check(op)), \
     (!(PyUnicode_IS_READY(op)) ? \
      _PyUnicode_Ready((PyObject *)(op)) : 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539
/* Return a maximum character value which is suitable for creating another
   string based on op. This is always an approximation but more efficient
   than iterating over the string.

   The result depends only on the object's form and kind:
     - compact ASCII                             -> 0x7f
     - 1-byte kind whose data pointer is utf8    -> 0x7f
     - other 1-byte kind                         -> 0xff
     - 2-byte kind                               -> 0xffff
     - anything else                             -> 0x10ffff
   The object must be ready (checked by assertion only). */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU) \
       ) : \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
        (0xffffU) : (0x10ffffU) \
       ))))
553
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000554#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000555
556/* --- Constants ---------------------------------------------------------- */
557
558/* This Unicode character will be used as replacement character during
559 decoding if the errors argument is set to "replace". Note: the
560 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
561 Unicode 3.0. */
562
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200563#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000564
565/* === Public API ========================================================= */
566
567/* --- Plain Py_UNICODE --------------------------------------------------- */
568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569/* With PEP 393, this is the recommended way to allocate a new unicode object.
570 This function will allocate the object and its buffer in a single memory
571 block. Objects created using this function are not resizable. */
572#ifndef Py_LIMITED_API
573PyAPI_FUNC(PyObject*) PyUnicode_New(
574 Py_ssize_t size, /* Number of code points in the new string */
575 Py_UCS4 maxchar /* maximum code point value in the string */
576 );
577#endif
578
/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   Returns 0 on success and -1 on errors (the PyUnicode_READY() macro
   passes this result through).

   Don't call this function directly, use the public PyUnicode_READY() macro
   instead. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) _PyUnicode_Ready(
    PyObject *unicode           /* Unicode object */
    );
#endif
591
Victor Stinner034f6cf2011-09-30 02:26:44 +0200592/* Get a copy of a Unicode string. */
593PyAPI_FUNC(PyObject*) PyUnicode_Copy(
594 PyObject *unicode
595 );
596
/* Copy characters from one unicode object into another. This function
   performs character conversion when necessary and falls back to memcpy
   if possible.

   Fail if to is too small (smaller than how_many or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if to has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,               /* Destination unicode object */
    Py_ssize_t to_start,        /* First index written in `to` */
    PyObject *from,             /* Source unicode object */
    Py_ssize_t from_start,      /* First index read from `from` */
    Py_ssize_t how_many         /* Maximum number of characters to copy */
    );
#endif
624
Guido van Rossumd8225182000-03-10 22:33:05 +0000625/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000627
628 u may be NULL which causes the contents to be undefined. It is the
629 user's responsibility to fill in the needed data afterwards. Note
630 that modifying the Unicode object contents after construction is
631 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000632
633 The buffer is copied into the new object. */
634
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000635#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000636PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000637 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000638 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000639 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000640#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000641
Georg Brandl952867a2010-06-27 10:17:12 +0000642/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000643PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000644 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000645 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646 );
647
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000648/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000650PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000651 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000652 );
653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654#ifndef Py_LIMITED_API
655PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
656 int kind,
657 const void *buffer,
658 Py_ssize_t size);
659#endif
660
661PyAPI_FUNC(PyObject*) PyUnicode_Substring(
662 PyObject *str,
663 Py_ssize_t start,
664 Py_ssize_t end);
665
666/* Copy the string into a UCS4 buffer, including the null character if copy_null
667 is set. Return NULL and raise an exception on error. Raise a ValueError if
668 the buffer is smaller than the string. Return buffer on success.
669
670 buflen is the length of the buffer in (Py_UCS4) characters. */
671PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
672 PyObject *unicode,
673 Py_UCS4* buffer,
674 Py_ssize_t buflen,
675 int copy_null);
676
677/* Copy the string into a UCS4 buffer. A new buffer is allocated using
678 * PyMem_Malloc; if this fails, NULL is returned with a memory error
679 exception set. */
680PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
681
Guido van Rossumd8225182000-03-10 22:33:05 +0000682/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200683 Py_UNICODE buffer.
684 If the wchar_t/Py_UNICODE representation is not yet available, this
685 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000686
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000687#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000688PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000690 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000691#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200693/* Return a read-only pointer to the Unicode object's internal
694 Py_UNICODE buffer and save the length at size.
695 If the wchar_t/Py_UNICODE representation is not yet available, this
696 function will calculate it. */
697
698#ifndef Py_LIMITED_API
699PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
700 PyObject *unicode, /* Unicode object */
701 Py_ssize_t *size /* location where to save the length */
702 );
703#endif
704
Guido van Rossumd8225182000-03-10 22:33:05 +0000705/* Get the length of the Unicode object. */
706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
708 PyObject *unicode
709);
710
Victor Stinner157f83f2011-09-28 21:41:31 +0200711/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 string representation. */
713
Martin v. Löwis18e16552006-02-15 17:27:45 +0000714PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000715 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000716 );
717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718/* Read a character from the string. */
719
720PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
721 PyObject *unicode,
722 Py_ssize_t index
723 );
724
725/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200726 PyUnicode_New, must not be shared, and must not have been hashed yet.
727
728 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729
730PyAPI_FUNC(int) PyUnicode_WriteChar(
731 PyObject *unicode,
732 Py_ssize_t index,
733 Py_UCS4 character
734 );
735
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000736#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000737/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000738PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000739#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000740
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200741/* Resize a Unicode object allocated by the legacy API (e.g.
742 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
743 PyUnicode_New) cannot be resized by this function.
744
745 The length is a number of Py_UNICODE characters (and not the number of code
746 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000747
748 *unicode is modified to point to the new (resized) object and 0
749 returned on success.
750
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200751 If the refcount on the object is 1, the function resizes the string in
752 place, which is usually faster than allocating a new string (and copying
753 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000754
755 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200756 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000757
Mark Hammond91a681d2002-08-12 07:21:58 +0000758PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000759 PyObject **unicode, /* Pointer to the Unicode object */
760 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000761 );
762
Guido van Rossumd8225182000-03-10 22:33:05 +0000763/* Coerce obj to a Unicode object and return a reference with
764 *incremented* refcount.
765
766 Coercion is done in the following way:
767
Georg Brandl952867a2010-06-27 10:17:12 +0000768 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000769 under the assumptions that they contain data using the UTF-8
770 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000771
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000772 2. All other objects (including Unicode objects) raise an
773 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000774
775 The API returns NULL in case of an error. The caller is responsible
776 for decref'ing the returned objects.
777
778*/
779
Mark Hammond91a681d2002-08-12 07:21:58 +0000780PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000782 const char *encoding, /* encoding */
783 const char *errors /* error handling */
784 );
785
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000786/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000787 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000789 Unicode objects are passed back as-is (subclasses are converted to
790 true Unicode objects), all other objects are delegated to
791 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000792 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000793
794 The API returns NULL in case of an error. The caller is responsible
795 for decref'ing the returned objects.
796
797*/
798
Mark Hammond91a681d2002-08-12 07:21:58 +0000799PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000801 );
802
Victor Stinner1205f272010-09-11 00:54:47 +0000803PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
804 const char *format, /* ASCII-encoded string */
805 va_list vargs
806 );
807PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
808 const char *format, /* ASCII-encoded string */
809 ...
810 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000811
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000812#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000813/* Format the object based on the format_spec, as defined in PEP 3101
814 (Advanced String Formatting). */
815PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200816 PyObject *format_spec,
817 Py_ssize_t start,
818 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000819#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000820
Walter Dörwald16807132007-05-25 13:52:07 +0000821PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
822PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000823PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
824 const char *u /* UTF-8 encoded string */
825 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000826#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000827PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000828#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000829
830/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831#define PyUnicode_CHECK_INTERNED(op) \
832 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000833
Guido van Rossumd8225182000-03-10 22:33:05 +0000834/* --- wchar_t support for platforms which support it --------------------- */
835
836#ifdef HAVE_WCHAR_H
837
Georg Brandl952867a2010-06-27 10:17:12 +0000838/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000839 size.
840
841 The buffer is copied into the new object. */
842
Mark Hammond91a681d2002-08-12 07:21:58 +0000843PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000844 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000845 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000846 );
847
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000848/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000849 most size wchar_t characters are copied.
850
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000851 Note that the resulting wchar_t string may or may not be
852 0-terminated. It is the responsibility of the caller to make sure
853 that the wchar_t string is 0-terminated in case this is required by
854 the application.
855
856 Returns the number of wchar_t characters copied (excluding a
857 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000858 error. */
859
Martin v. Löwis18e16552006-02-15 17:27:45 +0000860PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000861 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000862 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000863 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000864 );
865
Victor Stinner137c34c2010-09-29 10:25:54 +0000866/* Convert the Unicode object to a wide character string. The output string
867 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200868 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000869
870 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
871 on success. On error, raises a MemoryError and returns NULL; *size is
872 undefined. */
873
874PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000875 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000876 Py_ssize_t *size /* number of characters of the result */
877 );
878
Victor Stinner9f789e72011-10-01 03:57:28 +0200879#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200881#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882
Guido van Rossumd8225182000-03-10 22:33:05 +0000883#endif
884
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000885/* --- Unicode ordinals --------------------------------------------------- */
886
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000887/* Create a Unicode Object from the given Unicode code point ordinal.
888
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000889 The ordinal must be in range(0x10000) on narrow Python builds
890 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
891 raised in case it is not.
892
893*/
894
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000895PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000896
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000897/* --- Free-list management ----------------------------------------------- */
898
899/* Clear the free list used by the Unicode implementation.
900
901 This can be used to release memory used for objects on the free
902 list back to the Python memory allocator.
903
904*/
905
906PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
907
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000909
910 Many of these APIs take two arguments encoding and errors. These
911 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000912 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000913
Georg Brandl952867a2010-06-27 10:17:12 +0000914 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000915
916 Error handling is set by errors which may also be set to NULL
917 meaning to use the default handling defined for the codec. Default
918 error handling for all builtin codecs is "strict" (ValueErrors are
919 raised).
920
921 The codecs all use a similar interface. Only deviations from the
922 generic ones are documented.
923
924*/
925
Fred Drakecb093fe2000-05-09 19:51:53 +0000926/* --- Manage the default encoding ---------------------------------------- */
927
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000928/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000929 Unicode object unicode and the size of the encoded representation
930 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000931
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000932 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000933
Victor Stinner157f83f2011-09-28 21:41:31 +0200934 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 and subsequent calls will return the same string. The memory is released
936 when the unicodeobject is deallocated.
937
938 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
939 support the previous internal function with the same behaviour.
940
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000941 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000942 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000943
944 *** If you need to access the Unicode object as UTF-8 bytes string,
945 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000946*/
947
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000948#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000950 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000951 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000953#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000954
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000955/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000956 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
959 in the unicodeobject.
960
961 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
962 support the previous internal function with the same behaviour.
963
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000964 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000965 extracted from the returned data.
966
967 *** This API is for interpreter INTERNAL USE ONLY and will likely
968 *** be removed or changed for Python 3.1.
969
970 *** If you need to access the Unicode object as UTF-8 bytes string,
971 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000972
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000973*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000974
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000975#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
977#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000978#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000979
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000980/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000981
Mark Hammond91a681d2002-08-12 07:21:58 +0000982PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000983
Guido van Rossumd8225182000-03-10 22:33:05 +0000984/* --- Generic Codecs ----------------------------------------------------- */
985
986/* Create a Unicode object by decoding the encoded string s of the
987 given size. */
988
Mark Hammond91a681d2002-08-12 07:21:58 +0000989PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000990 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000991 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000992 const char *encoding, /* encoding */
993 const char *errors /* error handling */
994 );
995
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000996/* Decode a Unicode object unicode and return the result as Python
997 object. */
998
999PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001000 PyObject *unicode, /* Unicode object */
1001 const char *encoding, /* encoding */
1002 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001003 );
1004
1005/* Decode a Unicode object unicode and return the result as Unicode
1006 object. */
1007
1008PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 PyObject *unicode, /* Unicode object */
1010 const char *encoding, /* encoding */
1011 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001012 );
1013
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001015 Python string object. */
1016
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001017#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001018PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001019 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001020 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001021 const char *encoding, /* encoding */
1022 const char *errors /* error handling */
1023 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001024#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001025
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001026/* Encodes a Unicode object and returns the result as Python
1027 object. */
1028
1029PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 PyObject *unicode, /* Unicode object */
1031 const char *encoding, /* encoding */
1032 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001033 );
1034
Guido van Rossumd8225182000-03-10 22:33:05 +00001035/* Encodes a Unicode object and returns the result as Python string
1036 object. */
1037
Mark Hammond91a681d2002-08-12 07:21:58 +00001038PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001039 PyObject *unicode, /* Unicode object */
1040 const char *encoding, /* encoding */
1041 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001042 );
1043
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001044/* Encodes a Unicode object and returns the result as Unicode
1045 object. */
1046
1047PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048 PyObject *unicode, /* Unicode object */
1049 const char *encoding, /* encoding */
1050 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001051 );
1052
1053/* Build an encoding map. */
1054
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001055PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1056 PyObject* string /* 256 character map */
1057 );
1058
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001059/* --- UTF-7 Codecs ------------------------------------------------------- */
1060
Mark Hammond91a681d2002-08-12 07:21:58 +00001061PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 const char *string, /* UTF-7 encoded string */
1063 Py_ssize_t length, /* size of string */
1064 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001065 );
1066
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001067PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001068 const char *string, /* UTF-7 encoded string */
1069 Py_ssize_t length, /* size of string */
1070 const char *errors, /* error handling */
1071 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001072 );
1073
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001074#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001075PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001076 const Py_UNICODE *data, /* Unicode char buffer */
1077 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1078 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1079 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1080 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001082#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001083
Guido van Rossumd8225182000-03-10 22:33:05 +00001084/* --- UTF-8 Codecs ------------------------------------------------------- */
1085
Mark Hammond91a681d2002-08-12 07:21:58 +00001086PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 const char *string, /* UTF-8 encoded string */
1088 Py_ssize_t length, /* size of string */
1089 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001090 );
1091
Walter Dörwald69652032004-09-07 20:24:22 +00001092PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001093 const char *string, /* UTF-8 encoded string */
1094 Py_ssize_t length, /* size of string */
1095 const char *errors, /* error handling */
1096 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001097 );
1098
Mark Hammond91a681d2002-08-12 07:21:58 +00001099PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001101 );
1102
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001103#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1105 PyObject *unicode,
1106 const char *errors);
1107
Mark Hammond91a681d2002-08-12 07:21:58 +00001108PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 const Py_UNICODE *data, /* Unicode char buffer */
1110 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1111 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001112 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001113#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001114
Walter Dörwald41980ca2007-08-16 21:55:45 +00001115/* --- UTF-32 Codecs ------------------------------------------------------ */
1116
1117/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1118 the corresponding Unicode object.
1119
1120 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001122
1123 If byteorder is non-NULL, the decoder starts decoding using the
1124 given byte order:
1125
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 *byteorder == -1: little endian
1127 *byteorder == 0: native order
1128 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001129
1130 In native mode, the first four bytes of the stream are checked for a
1131 BOM mark. If found, the BOM mark is analysed, the byte order
1132 adjusted and the BOM skipped. In the other modes, no BOM mark
1133 interpretation is done. After completion, *byteorder is set to the
1134 current byte order at the end of input data.
1135
1136 If byteorder is NULL, the codec starts in native order mode.
1137
1138*/
1139
1140PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 const char *string, /* UTF-32 encoded string */
1142 Py_ssize_t length, /* size of string */
1143 const char *errors, /* error handling */
1144 int *byteorder /* pointer to byteorder to use
1145 0=native;-1=LE,1=BE; updated on
1146 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001147 );
1148
1149PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001150 const char *string, /* UTF-32 encoded string */
1151 Py_ssize_t length, /* size of string */
1152 const char *errors, /* error handling */
1153 int *byteorder, /* pointer to byteorder to use
1154 0=native;-1=LE,1=BE; updated on
1155 exit */
1156 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001157 );
1158
1159/* Returns a Python string using the UTF-32 encoding in native byte
1160 order. The string always starts with a BOM mark. */
1161
1162PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001163 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001164 );
1165
1166/* Returns a Python string object holding the UTF-32 encoded value of
1167 the Unicode data.
1168
1169 If byteorder is not 0, output is written according to the following
1170 byte order:
1171
1172 byteorder == -1: little endian
1173 byteorder == 0: native byte order (writes a BOM mark)
1174 byteorder == 1: big endian
1175
1176 If byteorder is 0, the output string will always start with the
1177 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1178 prepended.
1179
1180*/
1181
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001182#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001183PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001184 const Py_UNICODE *data, /* Unicode char buffer */
1185 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1186 const char *errors, /* error handling */
1187 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001188 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001189#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001190
Guido van Rossumd8225182000-03-10 22:33:05 +00001191/* --- UTF-16 Codecs ------------------------------------------------------ */
1192
Guido van Rossum9e896b32000-04-05 20:11:21 +00001193/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001194 the corresponding Unicode object.
1195
1196 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001198
1199 If byteorder is non-NULL, the decoder starts decoding using the
1200 given byte order:
1201
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 *byteorder == -1: little endian
1203 *byteorder == 0: native order
1204 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001205
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001206 In native mode, the first two bytes of the stream are checked for a
1207 BOM mark. If found, the BOM mark is analysed, the byte order
1208 adjusted and the BOM skipped. In the other modes, no BOM mark
1209 interpretation is done. After completion, *byteorder is set to the
1210 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001211
1212 If byteorder is NULL, the codec starts in native order mode.
1213
1214*/
1215
Mark Hammond91a681d2002-08-12 07:21:58 +00001216PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 const char *string, /* UTF-16 encoded string */
1218 Py_ssize_t length, /* size of string */
1219 const char *errors, /* error handling */
1220 int *byteorder /* pointer to byteorder to use
1221 0=native;-1=LE,1=BE; updated on
1222 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001223 );
1224
Walter Dörwald69652032004-09-07 20:24:22 +00001225PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 const char *string, /* UTF-16 encoded string */
1227 Py_ssize_t length, /* size of string */
1228 const char *errors, /* error handling */
1229 int *byteorder, /* pointer to byteorder to use
1230 0=native;-1=LE,1=BE; updated on
1231 exit */
1232 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001233 );
1234
Guido van Rossumd8225182000-03-10 22:33:05 +00001235/* Returns a Python bytes object using the UTF-16 encoding in native byte
1236 order. The output always starts with a BOM mark. */
1237
Mark Hammond91a681d2002-08-12 07:21:58 +00001238PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001240 );
1241
1242/* Returns a Python bytes object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001243 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001244
1245 Output is written according to the value of byteorder, as
1246 follows:
1247
1248 byteorder == -1: little endian
1249 byteorder == 0: native byte order (writes a BOM mark)
1250 byteorder == 1: big endian
1251
1252 If byteorder is 0, the output string will always start with the
1253 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1254 prepended.
1255
1256 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1257 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001258 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001259
1260*/
1261
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001262#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001263PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 const Py_UNICODE *data, /* Unicode char buffer */
1265 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1266 const char *errors, /* error handling */
1267 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001268 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001269#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001270
1271/* --- Unicode-Escape Codecs ---------------------------------------------- */
1272
Mark Hammond91a681d2002-08-12 07:21:58 +00001273PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 const char *string, /* Unicode-Escape encoded string */
1275 Py_ssize_t length, /* size of string */
1276 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001277 );
1278
Mark Hammond91a681d2002-08-12 07:21:58 +00001279PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001281 );
1282
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001283#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001284PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 const Py_UNICODE *data, /* Unicode char buffer */
1286 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001287 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001288#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001289
1290/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1291
Mark Hammond91a681d2002-08-12 07:21:58 +00001292PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001293 const char *string, /* Raw-Unicode-Escape encoded string */
1294 Py_ssize_t length, /* size of string */
1295 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001296 );
1297
Mark Hammond91a681d2002-08-12 07:21:58 +00001298PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001300 );
1301
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001302#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001303PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 const Py_UNICODE *data, /* Unicode char buffer */
1305 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001306 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001307#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001308
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001309/* --- Unicode Internal Codec ---------------------------------------------
1310
1311 Only for internal use in _codecsmodule.c */
1312
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001313#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001314PyObject *_PyUnicode_DecodeUnicodeInternal(
1315 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001316 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001317 const char *errors
1318 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001319#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001320
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001322
1323 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1324
1325*/
1326
Mark Hammond91a681d2002-08-12 07:21:58 +00001327PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 const char *string, /* Latin-1 encoded string */
1329 Py_ssize_t length, /* size of string */
1330 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001331 );
1332
Mark Hammond91a681d2002-08-12 07:21:58 +00001333PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001335 );
1336
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001337#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1339 PyObject* unicode,
1340 const char* errors);
1341
Mark Hammond91a681d2002-08-12 07:21:58 +00001342PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 const Py_UNICODE *data, /* Unicode char buffer */
1344 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1345 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001346 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001347#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001348
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001350
1351 Only 7-bit ASCII data is accepted. All other codes generate errors.
1352
1353*/
1354
Mark Hammond91a681d2002-08-12 07:21:58 +00001355PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 const char *string, /* ASCII encoded string */
1357 Py_ssize_t length, /* size of string */
1358 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001359 );
1360
Mark Hammond91a681d2002-08-12 07:21:58 +00001361PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001363 );
1364
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001365#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1367 PyObject* unicode,
1368 const char* errors);
1369
Mark Hammond91a681d2002-08-12 07:21:58 +00001370PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 const Py_UNICODE *data, /* Unicode char buffer */
1372 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1373 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001374 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001375#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001376
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001378
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001380
1381 Decoding mappings must map single string characters to single
1382 Unicode characters, integers (which are then interpreted as Unicode
1383 ordinals) or None (meaning "undefined mapping" and causing an
1384 error).
1385
1386 Encoding mappings must map single Unicode characters to single
1387 string characters, integers (which are then interpreted as Latin-1
1388 ordinals) or None (meaning "undefined mapping" and causing an
1389 error).
1390
1391 If a character lookup fails with a LookupError, the character is
1392 copied as-is meaning that its ordinal value will be interpreted as
1393 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1394 to contain those mappings which map characters to different code
1395 points.
1396
1397*/
1398
Mark Hammond91a681d2002-08-12 07:21:58 +00001399PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 const char *string, /* Encoded string */
1401 Py_ssize_t length, /* size of string */
1402 PyObject *mapping, /* character mapping
1403 (char ordinal -> unicode ordinal) */
1404 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001405 );
1406
Mark Hammond91a681d2002-08-12 07:21:58 +00001407PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 PyObject *unicode, /* Unicode object */
1409 PyObject *mapping /* character mapping
1410 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001411 );
1412
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001413#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001414PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001415 const Py_UNICODE *data, /* Unicode char buffer */
1416 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1417 PyObject *mapping, /* character mapping
1418 (unicode ordinal -> char ordinal) */
1419 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001420 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001421#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001422
1423/* Translate a Py_UNICODE buffer of the given length by applying a
1424 character mapping table to it and return the resulting Unicode
1425 object.
1426
1427 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001428 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001429
1430 Mapping tables may be dictionaries or sequences. Unmapped character
1431 ordinals (ones which cause a LookupError) are left untouched and
1432 are copied as-is.
1433
1434*/
1435
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001436#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001437PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 const Py_UNICODE *data, /* Unicode char buffer */
1439 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1440 PyObject *table, /* Translate table */
1441 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001442 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001443#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001444
Victor Stinner99b95382011-07-04 14:23:54 +02001445#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001446
Guido van Rossumefec1152000-03-28 02:01:15 +00001447/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001448
Mark Hammond91a681d2002-08-12 07:21:58 +00001449PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001450 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001451 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001452 const char *errors /* error handling */
1453 );
1454
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001455PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1456 const char *string, /* MBCS encoded string */
1457 Py_ssize_t length, /* size of string */
1458 const char *errors, /* error handling */
1459 Py_ssize_t *consumed /* bytes consumed */
1460 );
1461
Mark Hammond91a681d2002-08-12 07:21:58 +00001462PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001463 PyObject *unicode /* Unicode object */
1464 );
1465
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001466#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001467PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001468 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001469 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001470 const char *errors /* error handling */
1471 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001472#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001473
Victor Stinner99b95382011-07-04 14:23:54 +02001474#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001475
Guido van Rossum9e896b32000-04-05 20:11:21 +00001476/* --- Decimal Encoder ---------------------------------------------------- */
1477
1478/* Takes a Unicode string holding a decimal value and writes it into
1479 an output buffer using standard ASCII digit codes.
1480
1481 The output buffer has to provide at least length+1 bytes of storage
1482 area. The output string is 0-terminated.
1483
1484 The encoder converts whitespace to ' ', decimal characters to their
1485 corresponding ASCII digit and all other Latin-1 characters except
1486 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1487 are treated as errors. This includes embedded NUL characters.
1488
1489 Error handling is defined by the errors argument:
1490
1491 NULL or "strict": raise a ValueError
1492 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001493 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001494 "replace": replaces illegal characters with '?'
1495
1496 Returns 0 on success, -1 on failure.
1497
1498*/
1499
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001500#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001501PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001502 Py_UNICODE *s, /* Unicode buffer */
1503 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1504 char *output, /* Output buffer; must have size >= length+1 */
1505 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001506 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001507#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001508
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001509/* Transforms code points that have decimal digit property to the
1510 corresponding ASCII digit code points.
1511
1512 Returns a new Unicode string on success, NULL on failure.
1513*/
1514
Georg Brandlb5503082010-12-05 11:40:48 +00001515#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001516PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1517 Py_UNICODE *s, /* Unicode buffer */
1518 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1519 );
Georg Brandlb5503082010-12-05 11:40:48 +00001520#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1523 as argument instead of a raw buffer and length. This function additionally
1524 transforms spaces to ASCII because this is what the callers in longobject,
1525 floatobject, and complexobject did anyways. */
1526
1527#ifndef Py_LIMITED_API
1528PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1529 PyObject *unicode /* Unicode object */
1530 );
1531#endif
1532
Martin v. Löwis011e8422009-05-05 04:43:17 +00001533/* --- File system encoding ---------------------------------------------- */
1534
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001535/* ParseTuple converter: encode str objects to bytes using
1536 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001537
1538PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1539
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001540/* ParseTuple converter: decode bytes objects to unicode using
1541 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1542
1543PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1544
Victor Stinner77c38622010-05-14 15:58:55 +00001545/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1546 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001547
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001548 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1549 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001550
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001551 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001552*/
1553
1554PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1555 const char *s /* encoded string */
1556 );
1557
Victor Stinner77c38622010-05-14 15:58:55 +00001558/* Decode a string using Py_FileSystemDefaultEncoding
1559 and the "surrogateescape" error handler.
1560
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001561 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1562 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001563*/
1564
Martin v. Löwis011e8422009-05-05 04:43:17 +00001565PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1566 const char *s, /* encoded string */
1567 Py_ssize_t size /* size */
1568 );
1569
Victor Stinnerae6265f2010-05-15 16:27:27 +00001570/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001571 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001572
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001573 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1574 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001575*/
1576
1577PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1578 PyObject *unicode
1579 );
1580
Guido van Rossumd8225182000-03-10 22:33:05 +00001581/* --- Methods & Slots ----------------------------------------------------
1582
1583 These are capable of handling Unicode objects and strings on input
1584 (we refer to them as strings in the descriptions) and return
1585 Unicode objects or integers as appropriate. */
1586
1587/* Concat two strings giving a new Unicode string. */
1588
Mark Hammond91a681d2002-08-12 07:21:58 +00001589PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 PyObject *left, /* Left string */
1591 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001592 );
1593
Walter Dörwald1ab83302007-05-18 17:15:44 +00001594/* Concat two strings and put the result in *pleft
1595 (sets *pleft to NULL on error) */
1596
1597PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 PyObject **pleft, /* Pointer to left string */
1599 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001600 );
1601
1602/* Concat two strings, put the result in *pleft and drop the right object
1603 (sets *pleft to NULL on error) */
1604
1605PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001606 PyObject **pleft, /* Pointer to left string */
1607 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001608 );
1609
Guido van Rossumd8225182000-03-10 22:33:05 +00001610/* Split a string giving a list of Unicode strings.
1611
1612 If sep is NULL, splitting will be done at all whitespace
1613 substrings. Otherwise, splits occur at the given separator.
1614
1615 At most maxsplit splits will be done. If negative, no limit is set.
1616
1617 Separators are not included in the resulting list.
1618
1619*/
1620
Mark Hammond91a681d2002-08-12 07:21:58 +00001621PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 PyObject *s, /* String to split */
1623 PyObject *sep, /* String separator */
1624 Py_ssize_t maxsplit /* Maxsplit count */
1625 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001626
1627/* Split a string at line breaks, giving a list of Unicode strings.
1628
1629 CRLF is considered to be one line break. Line breaks are not
1630 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631
Mark Hammond91a681d2002-08-12 07:21:58 +00001632PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 PyObject *s, /* String to split */
1634 int keepends /* If true, line end markers are included */
1635 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001636
Thomas Wouters477c8d52006-05-27 19:21:47 +00001637/* Partition a string using a given separator. */
1638
1639PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001640 PyObject *s, /* String to partition */
1641 PyObject *sep /* String separator */
1642 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001643
1644/* Partition a string using a given separator, searching from the end of the
1645 string. */
1646
1647PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001648 PyObject *s, /* String to partition */
1649 PyObject *sep /* String separator */
1650 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001651
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001652/* Split a string giving a list of Unicode strings.
1653
1654 If sep is NULL, splitting will be done at all whitespace
1655 substrings. Otherwise, splits occur at the given separator.
1656
1657 At most maxsplit splits will be done; if negative, no limit is set.
1658 Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of the
1659 string.
1660
1661 Separators are not included in the resulting list.
1662
1663*/
1664
1665PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001666 PyObject *s, /* String to split */
1667 PyObject *sep, /* String separator */
1668 Py_ssize_t maxsplit /* Maxsplit count */
1669 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001670
Guido van Rossumd8225182000-03-10 22:33:05 +00001671/* Translate a string by applying a character mapping table to it and
1672 return the resulting Unicode object.
1673
1674 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001675 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001676
1677 Mapping tables may be dictionaries or sequences. Unmapped character
1678 ordinals (ones which cause a LookupError) are left untouched and
1679 are copied as-is.
1680
1681*/
1682
Mark Hammond91a681d2002-08-12 07:21:58 +00001683PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684 PyObject *str, /* String */
1685 PyObject *table, /* Translate table */
1686 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001687 );
1688
1689/* Join a sequence of strings using the given separator and return
1690 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001691
Mark Hammond91a681d2002-08-12 07:21:58 +00001692PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 PyObject *separator, /* Separator string */
1694 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001695 );
1696
1697/* Return 1 if substr matches str[start:end] at the given tail end, 0
1698 otherwise. Return -1 if an error occurred. */
1699
Martin v. Löwis18e16552006-02-15 17:27:45 +00001700PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 PyObject *str, /* String */
1702 PyObject *substr, /* Prefix or Suffix string */
1703 Py_ssize_t start, /* Start index */
1704 Py_ssize_t end, /* Stop index */
1705 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001706 );
1707
1708/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001709 given search direction or -1 if not found. -2 is returned in case
1710 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001711
Martin v. Löwis18e16552006-02-15 17:27:45 +00001712PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001713 PyObject *str, /* String */
1714 PyObject *substr, /* Substring to find */
1715 Py_ssize_t start, /* Start index */
1716 Py_ssize_t end, /* Stop index */
1717 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001718 );
1719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720/* Like PyUnicode_Find, but search for single character only. */
1721PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1722 PyObject *str,
1723 Py_UCS4 ch,
1724 Py_ssize_t start,
1725 Py_ssize_t end,
1726 int direction
1727 );
1728
Barry Warsaw51ac5802000-03-20 16:36:48 +00001729/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001730
Martin v. Löwis18e16552006-02-15 17:27:45 +00001731PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 PyObject *str, /* String */
1733 PyObject *substr, /* Substring to count */
1734 Py_ssize_t start, /* Start index */
1735 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001736 );
1737
Barry Warsaw51ac5802000-03-20 16:36:48 +00001738/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001739 and return the resulting Unicode object. */
1740
Mark Hammond91a681d2002-08-12 07:21:58 +00001741PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 PyObject *str, /* String */
1743 PyObject *substr, /* Substring to find */
1744 PyObject *replstr, /* Substring to replace */
1745 Py_ssize_t maxcount /* Max. number of replacements to apply;
1746 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001747 );
1748
1749/* Compare two strings and return -1, 0, 1 for less than, equal,
1750 greater than resp. */
1751
Mark Hammond91a681d2002-08-12 07:21:58 +00001752PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001753 PyObject *left, /* Left string */
1754 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001755 );
1756
Martin v. Löwis5b222132007-06-10 09:51:05 +00001757PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1758 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001759 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001760 );
1761
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001762/* Rich compare two strings and return one of the following:
1763
1764 - NULL in case an exception was raised
1765 - Py_True or Py_False for successful comparisons
1766 - Py_NotImplemented in case the type combination is unknown
1767
1768 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1769 case the conversion of the arguments to Unicode fails with a
1770 UnicodeDecodeError.
1771
1772 Possible values for op:
1773
1774 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1775
1776*/
1777
1778PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 PyObject *left, /* Left string */
1780 PyObject *right, /* Right string */
1781 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001782 );
1783
Thomas Wouters7e474022000-07-16 12:04:32 +00001784/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001785 the resulting Unicode string. */
1786
Mark Hammond91a681d2002-08-12 07:21:58 +00001787PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001788 PyObject *format, /* Format string */
1789 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001790 );
1791
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001792/* Checks whether element is contained in container and returns 1/0
1793 accordingly.
1794
1795 element has to coerce to a one-element Unicode string. -1 is
1796 returned in case of an error. */
1797
Mark Hammond91a681d2002-08-12 07:21:58 +00001798PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001799 PyObject *container, /* Container string */
1800 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001801 );
1802
Martin v. Löwis47383402007-08-15 07:32:56 +00001803/* Checks whether argument is a valid identifier. */
1804
1805PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1806
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001807#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001808/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001809PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001810 PyUnicodeObject *self,
1811 int striptype,
1812 PyObject *sepobj
1813 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001814#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001815
Eric Smith5807c412008-05-11 21:00:57 +00001816/* Using the current locale, insert the thousands grouping
1817 into the string pointed to by buffer. For the argument descriptions,
1818 see Objects/stringlib/localeutil.h */
1819
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001820#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001821PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1822 Py_ssize_t n_buffer,
1823 Py_UNICODE *digits,
1824 Py_ssize_t n_digits,
1825 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001826#endif
Eric Smith5807c412008-05-11 21:00:57 +00001827
Eric Smitha3b1ac82009-04-03 14:45:06 +00001828/* Using explicit passed-in values, insert the thousands grouping
1829 into the string pointed to by buffer. For the argument descriptions,
1830 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001831#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1833 int kind,
1834 void *buffer,
1835 Py_ssize_t n_buffer,
1836 void *digits,
1837 Py_ssize_t n_digits,
1838 Py_ssize_t min_width,
1839 const char *grouping,
1840 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001841#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001842/* === Characters Type APIs =============================================== */
1843
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001844/* Helper array used by Py_UNICODE_ISSPACE(). */
1845
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001846#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001847PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1848
Guido van Rossumd8225182000-03-10 22:33:05 +00001849/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001851
1852 These APIs are implemented in Objects/unicodectype.c.
1853
1854*/
1855
Mark Hammond91a681d2002-08-12 07:21:58 +00001856PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001857 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001858 );
1859
Mark Hammond91a681d2002-08-12 07:21:58 +00001860PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001861 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001862 );
1863
Mark Hammond91a681d2002-08-12 07:21:58 +00001864PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001865 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001866 );
1867
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001868PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001869 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001870 );
1871
1872PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001873 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001874 );
1875
Mark Hammond91a681d2002-08-12 07:21:58 +00001876PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001877 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001878 );
1879
Mark Hammond91a681d2002-08-12 07:21:58 +00001880PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001881 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001882 );
1883
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001884PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1885 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001886 );
1887
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001888PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1889 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001890 );
1891
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001892PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1893 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001894 );
1895
Mark Hammond91a681d2002-08-12 07:21:58 +00001896PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001897 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001898 );
1899
Mark Hammond91a681d2002-08-12 07:21:58 +00001900PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001901 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001902 );
1903
Mark Hammond91a681d2002-08-12 07:21:58 +00001904PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001905 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001906 );
1907
Mark Hammond91a681d2002-08-12 07:21:58 +00001908PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001909 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001910 );
1911
Mark Hammond91a681d2002-08-12 07:21:58 +00001912PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001913 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001914 );
1915
Mark Hammond91a681d2002-08-12 07:21:58 +00001916PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001917 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001918 );
1919
Georg Brandl559e5d72008-06-11 18:37:52 +00001920PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001921 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001922 );
1923
Mark Hammond91a681d2002-08-12 07:21:58 +00001924PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001925 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001926 );
1927
Victor Stinneref8d95c2010-08-16 22:03:11 +00001928PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1929 const Py_UNICODE *u
1930 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001931
1932PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001933 Py_UNICODE *s1,
1934 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001935
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001936PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1937 Py_UNICODE *s1, const Py_UNICODE *s2);
1938
Martin v. Löwis5b222132007-06-10 09:51:05 +00001939PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001940 Py_UNICODE *s1,
1941 const Py_UNICODE *s2,
1942 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001943
1944PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001945 const Py_UNICODE *s1,
1946 const Py_UNICODE *s2
1947 );
1948
1949PyAPI_FUNC(int) Py_UNICODE_strncmp(
1950 const Py_UNICODE *s1,
1951 const Py_UNICODE *s2,
1952 size_t n
1953 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001954
1955PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001956 const Py_UNICODE *s,
1957 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001958 );
1959
Victor Stinner331ea922010-08-10 16:37:20 +00001960PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001961 const Py_UNICODE *s,
1962 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001963 );
1964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965PyAPI_FUNC(size_t) Py_UCS4_strlen(
1966 const Py_UCS4 *u
1967 );
1968
1969PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1970 Py_UCS4 *s1,
1971 const Py_UCS4 *s2);
1972
1973PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1974 Py_UCS4 *s1, const Py_UCS4 *s2);
1975
1976PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1977 Py_UCS4 *s1,
1978 const Py_UCS4 *s2,
1979 size_t n);
1980
1981PyAPI_FUNC(int) Py_UCS4_strcmp(
1982 const Py_UCS4 *s1,
1983 const Py_UCS4 *s2
1984 );
1985
1986PyAPI_FUNC(int) Py_UCS4_strncmp(
1987 const Py_UCS4 *s1,
1988 const Py_UCS4 *s2,
1989 size_t n
1990 );
1991
1992PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1993 const Py_UCS4 *s,
1994 Py_UCS4 c
1995 );
1996
1997PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1998 const Py_UCS4 *s,
1999 Py_UCS4 c
2000 );
2001
Victor Stinner71133ff2010-09-01 23:43:53 +00002002/* Create a copy of a unicode string ending with a nul character. Return NULL
2003 and raise a MemoryError exception on memory allocation failure, otherwise
2004 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2005
Victor Stinner46408602010-09-03 16:18:00 +00002006PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002007 PyObject *unicode
2008 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002009#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002010
Guido van Rossumd8225182000-03-10 22:33:05 +00002011#ifdef __cplusplus
2012}
2013#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002014#endif /* !Py_UNICODEOBJECT_H */