blob: 8e19ebc0ad3acc57bde5fe122bc11bac11416ae0 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
/* SIZEOF_WCHAR_T has to be defined before this point; compilation is
   aborted with an #error otherwise. */
#ifndef SIZEOF_WCHAR_T
#error Must define SIZEOF_WCHAR_T
#endif

/* Size of one Py_UNICODE code unit, taken directly from SIZEOF_WCHAR_T. */
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T

/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
   for UTF-16) */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t.

   Both names are only declared when Py_LIMITED_API is not defined. */

#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

/* HAVE_MBCS is defined only when MS_WINDOWS is defined. */
#if defined(MS_WINDOWS)
#  define HAVE_MBCS
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
#  include <wchar.h>
#endif
117
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
   unicode representations.

   Py_UCS4 is the first of unsigned int / unsigned long whose SIZEOF_*
   macro is at least 4; the build fails with an #error if neither
   qualifies.  Py_UCS2 and Py_UCS1 are unconditionally unsigned short
   and unsigned char. */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#else
#error "Could not find a proper typedef for Py_UCS4"
#endif

typedef unsigned short Py_UCS2;
typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are looked up in the
   _Py_ascii_whitespace table; all other values are passed to
   _PyUnicode_IsWhitespace().  Note that ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   functions. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions: thin aliases for the _PyUnicode_To*case() functions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric-class and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.  The tests are
   tried in that order and short-circuit, so ch may be evaluated up to
   four times; do not pass an expression with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
       (Py_UNICODE_ISALPHA(ch) || \
    Py_UNICODE_ISDECIMAL(ch) || \
    Py_UNICODE_ISDIGIT(ch) || \
    Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE code units from `source` to `target` using
   Py_MEMCPY.  The byte count is length*sizeof(Py_UNICODE); no overflow or
   overlap checking is done by this macro. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` elements of the Py_UNICODE buffer
   `target`.

   Each argument is evaluated exactly once, in the order target, value,
   length; in particular `length` is captured in a local before the loop
   instead of being re-evaluated on every iteration, so an expression with
   side effects is safe.  A length of zero (or a negative length) writes
   nothing.  The trailing-underscore locals are private to the macro; do
   not pass arguments that are themselves named i_, t_, v_ or n_. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* macros to work with surrogates

   Each predicate is true when ch lies in the stated inclusive range:
     Py_UNICODE_IS_SURROGATE       0xD800..0xDFFF
     Py_UNICODE_IS_HIGH_SURROGATE  0xD800..0xDBFF
     Py_UNICODE_IS_LOW_SURROGATE   0xDC00..0xDFFF
   ch is parenthesized in the expansion so that arbitrary expressions
   (e.g. `c | 1`) are compared as a whole; it is still evaluated twice,
   so avoid side effects in the argument. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; the arguments are not
   checked for actually being a high/low surrogate. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The expansion compares the first code unit, then the last code unit,
   and only then runs memcmp() over the whole substring, so most
   mismatches are rejected before the memcmp() call.  All three arguments
   are evaluated several times.

   NOTE(review): the macro dereferences ->wstr and ->wstr_length directly
   on its arguments.  In the structures declared in this header, wstr is a
   member of PyASCIIObject and wstr_length a member of
   PyCompactUnicodeObject, both reached through _base members from
   PyUnicodeObject -- confirm which pointer type callers are expected to
   pass.  memcmp() also requires <string.h>, which this header does not
   include in the portion visible here. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follow the structure. utf8_length and wstr_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 4 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * ready = 1
         * utf8 = data

       - compact:

         * structure = PyCompactUnicodeObject
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
         * ascii = 0
         * utf8 != data
         * wstr is shared with data if kind=PyUnicode_2BYTE_KIND
           and sizeof(wchar_t)=2 or if kind=PyUnicode_4BYTE_KIND and
           sizeof(wchar_t)=4

       - legacy string, not ready:

         * structure = PyUnicodeObject
         * kind = PyUnicode_WCHAR_KIND
         * compact = 0
         * ready = 0
         * wstr is not NULL
         * data.any is NULL
         * utf8 is NULL
         * interned = SSTATE_NOT_INTERNED
         * ascii = 0

       - legacy string, ready:

         * structure = PyUnicodeObject structure
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * ready = 1
         * data.any is not NULL
         * utf8 is shared with data.any if ascii = 1
         * wstr is shared with data.any if kind=PyUnicode_2BYTE_KIND
           and sizeof(wchar_t)=2 or if kind=PyUnicode_4BYTE_KIND and
           sizeof(wchar_t)=4

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by PyUnicode_FromUnicode() and
       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
       when PyUnicode_READY() is called.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Packed per-object flags; the bit-fields below occupy 7 bits in
       total (2 + 2 + 1 + 1 + 1). */
    struct {
        /*
           SSTATE_NOT_INTERNED (0)
           SSTATE_INTERNED_MORTAL (1)
           SSTATE_INTERNED_IMMORTAL (2)

           If interned != SSTATE_NOT_INTERNED, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
         */
        unsigned int interned:2;
        /* Character size:

           PyUnicode_WCHAR_KIND (0): wchar_t*
           PyUnicode_1BYTE_KIND (1): Py_UCS1*
           PyUnicode_2BYTE_KIND (2): Py_UCS2*
           PyUnicode_4BYTE_KIND (3): Py_UCS4*
         */
        unsigned int kind:2;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
           characters. If ascii is 1 and compact is 1, use the PyASCIIObject
           structure. */
        unsigned int ascii:1;
        /* The ready flag indicates whether the object layout is initialized
           completely. This means that this is either a compact object, or
           the data pointer is filled out. The bit is redundant, and helps
           to minimize the test in PyUnicode_IS_READY(). */
        unsigned int ready:1;
    } state;
    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
} PyASCIIObject;
306
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure.

   The PyASCIIObject header is the first member, so a pointer to this
   structure can also be read through a PyASCIIObject pointer (the access
   macros in this header rely on such casts). */
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
                                 * surrogates count as two code points. */
} PyCompactUnicodeObject;
318
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
   PyUnicodeObject structure. The actual string data is initially in the wstr
   block, and copied into the data block using _PyUnicode_Ready.

   The PyCompactUnicodeObject header is the first member; the `data` union
   offers the same buffer pointer typed for each character width, with
   `any` as the untyped view. */
typedef struct {
    PyCompactUnicodeObject _base;
    union {
        void *any;
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000331#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000332
/* Type objects exported for the Unicode type and its iterator. */
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;

/* PyUnicode_Check(op): tests the Py_TPFLAGS_UNICODE_SUBCLASS flag on
   op's type via PyType_FastSubclass().
   PyUnicode_CheckExact(op): true only if op's type is exactly
   PyUnicode_Type. */
#define PyUnicode_Check(op) \
                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000339
340/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000341#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342
/* Length of the wstr representation.  For compact ASCII strings this is
   the `length` field of PyASCIIObject; for every other string it is the
   `wstr_length` field of PyCompactUnicodeObject.

   op is parenthesized before being cast so that an arbitrary pointer
   expression is converted as a whole.  op is evaluated more than once;
   no type check or Ready call is performed. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                    \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
347
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points.

   op is evaluated more than once.  When wstr is NULL the macro calls
   PyUnicode_AsUnicode() and discards its return value before reading the
   length, so a failure of that call is not reported through this macro. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation: PyUnicode_GET_SIZE(op)
   multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
362
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   If the wstr pointer is already set it is returned directly; otherwise
   the result of PyUnicode_AsUnicode() is returned.  op is evaluated more
   than once. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* Same pointer as PyUnicode_AS_UNICODE(op), cast to const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
375
376
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
378
/* Values for PyUnicodeObject.state: */

/* Interning state.  These are the values stored in the 2-bit
   state.interned bit-field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
385
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.

   The macro reads the state.ascii bit through a PyASCIIObject cast; op is
   parenthesized before the cast so that any pointer expression is
   converted as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)
391
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed.
   Reads the state.compact bit through a PyASCIIObject cast. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op is evaluated twice (once per flag test). */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet. */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro.  Note that the value is
   not the character width itself: 1, 2 and 3 stand for 1-, 2- and 4-byte
   characters respectively. */

#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3
412
413
/* Return the number of bytes the string uses to represent single characters,
   this can be 1, 2 or 4.

   Computed as 1 << (kind - 1), which maps the kind values 1, 2, 3 to
   1, 2, 4.  PyUnicode_KIND() asserts that the string is ready;
   NOTE(review): a kind of PyUnicode_WCHAR_KIND (0) would yield a negative
   shift count -- confirm that callers never reach this with a non-ready
   string when assertions are compiled out.

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op) \
    (1 << (PyUnicode_KIND(op) - 1))
420
/* Return pointers to the canonical representation cast as unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly.
   Each macro is PyUnicode_DATA(op) with a cast to the element type. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
429
/* Return one of the PyUnicode_*_KIND values defined above.
   Asserts that op is a Unicode object and that it is ready, then reads
   the state.kind bit-field.  op is evaluated more than once when
   assertions are enabled. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
435
/* Return a void pointer to the raw unicode buffer. */

/* Compact strings: the characters start right after the header, i.e. one
   PyASCIIObject past op for compact ASCII strings and one
   PyCompactUnicodeObject past op for other compact strings. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact strings: return the data.any pointer of PyUnicodeObject,
   asserting that it is not NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on state.compact to one of the two helpers above, after
   asserting that op is a Unicode object. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
450
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
   The index is a character index, the result is a size in bytes.
   Implemented as a left shift of index by (kind - 1); no overflow or
   kind-range checking is performed.

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
456
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   The value is cast to the element type selected by kind (Py_UCS1,
   Py_UCS2 or Py_UCS4) before being stored.  Any kind other than the
   1-byte and 2-byte kinds falls into the default branch, which asserts
   that it is PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
483
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.

   The element type is chosen from kind: Py_UCS1 for the 1-byte kind,
   Py_UCS2 for the 2-byte kind and Py_UCS4 for anything else; the result
   is converted to Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
495
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.

   Asserts that `unicode` is a ready Unicode object.  `unicode` is
   evaluated several times, so it must not have side effects. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511
/* Returns the length of the unicode string.  The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that.
   Both preconditions (Unicode object, ready) are checked with assert()
   only; the value returned is the `length` field of PyASCIIObject. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
519
520
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   Reads the state.ready bit through a PyASCIIObject cast; op is
   parenthesized before the cast so that any pointer expression is
   converted as a whole.  No type check is performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
525
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors.

   When the ready bit is already set the macro evaluates to 0 without
   calling any function.  op is evaluated more than once. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   Result by case:
     - compact ASCII string                       -> 0x7f
     - 1-byte kind whose data pointer equals utf8 -> 0x7f
     - other 1-byte kind                          -> 0xff
     - 2-byte kind                                -> 0xffff
     - anything else                              -> 0x10ffff
   Asserts that op is ready; op is evaluated several times. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f:                            \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU)                                               \
           ) :                                                          \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) : (0x10ffffU)                                         \
           ))))
548
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000549#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000550
551/* --- Constants ---------------------------------------------------------- */
552
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace".  Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0.  The constant has type Py_UCS4. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000559
560/* === Public API ========================================================= */
561
562/* --- Plain Py_UNICODE --------------------------------------------------- */
563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564/* With PEP 393, this is the recommended way to allocate a new unicode object.
565 This function will allocate the object and its buffer in a single memory
566 block. Objects created using this function are not resizable. */
567#ifndef Py_LIMITED_API
568PyAPI_FUNC(PyObject*) PyUnicode_New(
569 Py_ssize_t size, /* Number of code points in the new string */
570 Py_UCS4 maxchar /* maximum code point value in the string */
571 );
572#endif
573
/* Initializes the canonical string representation from the deprecated
575 wstr/Py_UNICODE representation. This function is used to convert Unicode
576 objects which were created using the old API to the new flexible format
577 introduced with PEP 393.
578
579 Don't call this function directly, use the public PyUnicode_READY() macro
580 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200581#ifndef Py_LIMITED_API
582PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200583 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200584 );
585#endif
586
Victor Stinner034f6cf2011-09-30 02:26:44 +0200587/* Get a copy of a Unicode string. */
588PyAPI_FUNC(PyObject*) PyUnicode_Copy(
589 PyObject *unicode
590 );
591
/* Copy characters from one unicode object into another; this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200593 character conversion when necessary and falls back to memcpy if possible.
594
Victor Stinnera0702ab2011-09-29 14:14:38 +0200595 Fail if to is too small (smaller than how_many or smaller than
596 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
597 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200598
599 Return the number of written character, or return -1 and raise an exception
600 on error.
601
602 Pseudo-code:
603
604 how_many = min(how_many, len(from) - from_start)
605 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
606 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200607
608 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200609 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200611PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612 PyObject *to,
613 Py_ssize_t to_start,
614 PyObject *from,
615 Py_ssize_t from_start,
616 Py_ssize_t how_many
617 );
618#endif
619
Guido van Rossumd8225182000-03-10 22:33:05 +0000620/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000621 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000622
623 u may be NULL which causes the contents to be undefined. It is the
624 user's responsibility to fill in the needed data afterwards. Note
625 that modifying the Unicode object contents after construction is
626 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000627
628 The buffer is copied into the new object. */
629
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000630#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000631PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000632 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000633 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000634 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000635#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000636
Georg Brandl952867a2010-06-27 10:17:12 +0000637/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000638PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000639 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000640 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000641 );
642
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000643/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200644 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000645PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000646 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000647 );
648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649#ifndef Py_LIMITED_API
650PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
651 int kind,
652 const void *buffer,
653 Py_ssize_t size);
654#endif
655
656PyAPI_FUNC(PyObject*) PyUnicode_Substring(
657 PyObject *str,
658 Py_ssize_t start,
659 Py_ssize_t end);
660
661/* Copy the string into a UCS4 buffer including the null character if copy_null
662 is set. Return NULL and raise an exception on error. Raise a ValueError if
663 the buffer is smaller than the string. Return buffer on success.
664
665 buflen is the length of the buffer in (Py_UCS4) characters. */
666PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
667 PyObject *unicode,
668 Py_UCS4* buffer,
669 Py_ssize_t buflen,
670 int copy_null);
671
672/* Copy the string into a UCS4 buffer. A new buffer is allocated using
673 PyMem_Malloc; if this fails, NULL is returned with a memory error
674 exception set. */
675PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
676
Guido van Rossumd8225182000-03-10 22:33:05 +0000677/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200678 Py_UNICODE buffer.
679 If the wchar_t/Py_UNICODE representation is not yet available, this
680 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000681
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000682#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000683PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000685 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000686#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200688/* Return a read-only pointer to the Unicode object's internal
689 Py_UNICODE buffer and save the length at size.
690 If the wchar_t/Py_UNICODE representation is not yet available, this
691 function will calculate it. */
692
693#ifndef Py_LIMITED_API
694PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
695 PyObject *unicode, /* Unicode object */
696 Py_ssize_t *size /* location where to save the length */
697 );
698#endif
699
Guido van Rossumd8225182000-03-10 22:33:05 +0000700/* Get the length of the Unicode object. */
701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200702PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
703 PyObject *unicode
704);
705
Victor Stinner157f83f2011-09-28 21:41:31 +0200706/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707 string representation. */
708
Martin v. Löwis18e16552006-02-15 17:27:45 +0000709PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000711 );
712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713/* Read a character from the string. */
714
715PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
716 PyObject *unicode,
717 Py_ssize_t index
718 );
719
720/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200721 PyUnicode_New, must not be shared, and must not have been hashed yet.
722
723 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724
725PyAPI_FUNC(int) PyUnicode_WriteChar(
726 PyObject *unicode,
727 Py_ssize_t index,
728 Py_UCS4 character
729 );
730
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000731#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000732/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000733PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000734#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000735
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200736/* Resize a Unicode object allocated by the legacy API (e.g.
737 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
738 PyUnicode_New) cannot be resized by this function.
739
740 The length is a number of Py_UNICODE characters (and not the number of code
741 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000742
743 *unicode is modified to point to the new (resized) object and 0
744 returned on success.
745
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200746 If the refcount on the object is 1, the function resizes the string in
747 place, which is usually faster than allocating a new string (and copying
748 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000749
750 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200751 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000752
Mark Hammond91a681d2002-08-12 07:21:58 +0000753PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 PyObject **unicode, /* Pointer to the Unicode object */
755 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000756 );
757
Guido van Rossumd8225182000-03-10 22:33:05 +0000758/* Coerce obj to a Unicode object and return a reference with
759 *incremented* refcount.
760
761 Coercion is done in the following way:
762
Georg Brandl952867a2010-06-27 10:17:12 +0000763 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000764 under the assumptions that they contain data using the UTF-8
765 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000766
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000767 2. All other objects (including Unicode objects) raise an
768 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000769
770 The API returns NULL in case of an error. The caller is responsible
771 for decref'ing the returned objects.
772
773*/
774
Mark Hammond91a681d2002-08-12 07:21:58 +0000775PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000777 const char *encoding, /* encoding */
778 const char *errors /* error handling */
779 );
780
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000781/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000782 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000784 Unicode objects are passed back as-is (subclasses are converted to
785 true Unicode objects), all other objects are delegated to
786 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000787 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000788
789 The API returns NULL in case of an error. The caller is responsible
790 for decref'ing the returned objects.
791
792*/
793
Mark Hammond91a681d2002-08-12 07:21:58 +0000794PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000796 );
797
Victor Stinner1205f272010-09-11 00:54:47 +0000798PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
799 const char *format, /* ASCII-encoded string */
800 va_list vargs
801 );
802PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
803 const char *format, /* ASCII-encoded string */
804 ...
805 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000806
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000807#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000808/* Format the object based on the format_spec, as defined in PEP 3101
809 (Advanced String Formatting). */
810PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200811 PyObject *format_spec,
812 Py_ssize_t start,
813 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000814#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000815
Walter Dörwald16807132007-05-25 13:52:07 +0000816PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
817PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000818PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
819 const char *u /* UTF-8 encoded string */
820 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000821#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000822PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000823#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000824
825/* Evaluate to the state.interned field of op, after casting op to
   PyASCIIObject * without any type check.  Use only if you know it's a
   string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826#define PyUnicode_CHECK_INTERNED(op) \
827 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000828
Guido van Rossumd8225182000-03-10 22:33:05 +0000829/* --- wchar_t support for platforms which support it --------------------- */
830
831#ifdef HAVE_WCHAR_H
832
Georg Brandl952867a2010-06-27 10:17:12 +0000833/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000834 size.
835
836 The buffer is copied into the new object. */
837
Mark Hammond91a681d2002-08-12 07:21:58 +0000838PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000839 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000840 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000841 );
842
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000843/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000844 most size wchar_t characters are copied.
845
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000846 Note that the resulting wchar_t string may or may not be
847 0-terminated. It is the responsibility of the caller to make sure
848 that the wchar_t string is 0-terminated in case this is required by
849 the application.
850
851 Returns the number of wchar_t characters copied (excluding a
852 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000853 error. */
854
Martin v. Löwis18e16552006-02-15 17:27:45 +0000855PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000856 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000857 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000858 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000859 );
860
Victor Stinner137c34c2010-09-29 10:25:54 +0000861/* Convert the Unicode object to a wide character string. The output string
862 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200863 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000864
865 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
866 on success. On error, returns NULL, *size is undefined and raises a
867 MemoryError. */
868
869PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000870 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000871 Py_ssize_t *size /* number of characters of the result */
872 );
873
Victor Stinner9f789e72011-10-01 03:57:28 +0200874#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200876#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877
Guido van Rossumd8225182000-03-10 22:33:05 +0000878#endif
879
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000880/* --- Unicode ordinals --------------------------------------------------- */
881
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882/* Create a Unicode Object from the given Unicode code point ordinal.
883
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000884 The ordinal must be in range(0x10000) on narrow Python builds
885 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
886 raised in case it is not.
887
888*/
889
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000890PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000891
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000892/* --- Free-list management ----------------------------------------------- */
893
894/* Clear the free list used by the Unicode implementation.
895
896 This can be used to release memory used for objects on the free
897 list back to the Python memory allocator.
898
899*/
900
901PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
902
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000904
905 Many of these APIs take two arguments encoding and errors. These
906 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000907 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000908
Georg Brandl952867a2010-06-27 10:17:12 +0000909 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000910
911 Error handling is set by errors which may also be set to NULL
912 meaning to use the default handling defined for the codec. Default
913 error handling for all builtin codecs is "strict" (ValueErrors are
914 raised).
915
916 The codecs all use a similar interface. Only deviations from the
917 generic ones are documented.
918
919*/
920
Fred Drakecb093fe2000-05-09 19:51:53 +0000921/* --- Manage the default encoding ---------------------------------------- */
922
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000923/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000924 Unicode object unicode and the size of the encoded representation
925 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000926
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000927 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000928
Victor Stinner157f83f2011-09-28 21:41:31 +0200929 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200930 and subsequent calls will return the same string. The memory is released
931 when the unicodeobject is deallocated.
932
933 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
934 support the previous internal function with the same behaviour.
935
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000936 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000937 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000938
939 *** If you need to access the Unicode object as UTF-8 bytes string,
940 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000941*/
942
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000943#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200944PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000945 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000946 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200947#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000948#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000949
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000950/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000951 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
954 in the unicodeobject.
955
956 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
957 support the previous internal function with the same behaviour.
958
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000959 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000960 extracted from the returned data.
961
962 *** This API is for interpreter INTERNAL USE ONLY and will likely
963 *** be removed or changed for Python 3.1.
964
965 *** If you need to access the Unicode object as UTF-8 bytes string,
966 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000967
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000968*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000969
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000970#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
972#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000973#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000974
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000975/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000976
Mark Hammond91a681d2002-08-12 07:21:58 +0000977PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000978
Guido van Rossumd8225182000-03-10 22:33:05 +0000979/* --- Generic Codecs ----------------------------------------------------- */
980
981/* Create a Unicode object by decoding the encoded string s of the
982 given size. */
983
Mark Hammond91a681d2002-08-12 07:21:58 +0000984PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000985 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000986 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000987 const char *encoding, /* encoding */
988 const char *errors /* error handling */
989 );
990
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000991/* Decode a Unicode object unicode and return the result as Python
992 object. */
993
994PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000995 PyObject *unicode, /* Unicode object */
996 const char *encoding, /* encoding */
997 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000998 );
999
1000/* Decode a Unicode object unicode and return the result as Unicode
1001 object. */
1002
1003PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001004 PyObject *unicode, /* Unicode object */
1005 const char *encoding, /* encoding */
1006 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001007 );
1008
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001010 Python string object. */
1011
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001012#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001013PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001014 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001015 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001016 const char *encoding, /* encoding */
1017 const char *errors /* error handling */
1018 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001019#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001020
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001021/* Encodes a Unicode object and returns the result as Python
1022 object. */
1023
1024PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001025 PyObject *unicode, /* Unicode object */
1026 const char *encoding, /* encoding */
1027 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001028 );
1029
Guido van Rossumd8225182000-03-10 22:33:05 +00001030/* Encodes a Unicode object and returns the result as Python string
1031 object. */
1032
Mark Hammond91a681d2002-08-12 07:21:58 +00001033PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001034 PyObject *unicode, /* Unicode object */
1035 const char *encoding, /* encoding */
1036 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001037 );
1038
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001039/* Encodes a Unicode object and returns the result as Unicode
1040 object. */
1041
1042PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 PyObject *unicode, /* Unicode object */
1044 const char *encoding, /* encoding */
1045 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001046 );
1047
1048/* Build an encoding map. */
1049
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001050PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1051 PyObject* string /* 256 character map */
1052 );
1053
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001054/* --- UTF-7 Codecs ------------------------------------------------------- */
1055
Mark Hammond91a681d2002-08-12 07:21:58 +00001056PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 const char *string, /* UTF-7 encoded string */
1058 Py_ssize_t length, /* size of string */
1059 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001060 );
1061
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001062PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 const char *string, /* UTF-7 encoded string */
1064 Py_ssize_t length, /* size of string */
1065 const char *errors, /* error handling */
1066 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001067 );
1068
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001069#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001070PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 const Py_UNICODE *data, /* Unicode char buffer */
1072 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1073 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1074 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1075 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001077#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078
Guido van Rossumd8225182000-03-10 22:33:05 +00001079/* --- UTF-8 Codecs ------------------------------------------------------- */
1080
Mark Hammond91a681d2002-08-12 07:21:58 +00001081PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001082 const char *string, /* UTF-8 encoded string */
1083 Py_ssize_t length, /* size of string */
1084 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001085 );
1086
Walter Dörwald69652032004-09-07 20:24:22 +00001087PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001088 const char *string, /* UTF-8 encoded string */
1089 Py_ssize_t length, /* size of string */
1090 const char *errors, /* error handling */
1091 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001092 );
1093
Mark Hammond91a681d2002-08-12 07:21:58 +00001094PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001096 );
1097
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001098#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1100 PyObject *unicode,
1101 const char *errors);
1102
Mark Hammond91a681d2002-08-12 07:21:58 +00001103PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001104 const Py_UNICODE *data, /* Unicode char buffer */
1105 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1106 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001107 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001108#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001109
Walter Dörwald41980ca2007-08-16 21:55:45 +00001110/* --- UTF-32 Codecs ------------------------------------------------------ */
1111
1112/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1113 the corresponding Unicode object.
1114
1115 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001117
1118 If byteorder is non-NULL, the decoder starts decoding using the
1119 given byte order:
1120
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 *byteorder == -1: little endian
1122 *byteorder == 0: native order
1123 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001124
1125 In native mode, the first four bytes of the stream are checked for a
1126 BOM mark. If found, the BOM mark is analysed, the byte order
1127 adjusted and the BOM skipped. In the other modes, no BOM mark
1128 interpretation is done. After completion, *byteorder is set to the
1129 current byte order at the end of input data.
1130
1131 If byteorder is NULL, the codec starts in native order mode.
1132
1133*/
1134
1135PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 const char *string, /* UTF-32 encoded string */
1137 Py_ssize_t length, /* size of string */
1138 const char *errors, /* error handling */
1139 int *byteorder /* pointer to byteorder to use
1140 0=native;-1=LE,1=BE; updated on
1141 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001142 );
1143
1144PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 const char *string, /* UTF-32 encoded string */
1146 Py_ssize_t length, /* size of string */
1147 const char *errors, /* error handling */
1148 int *byteorder, /* pointer to byteorder to use
1149 0=native;-1=LE,1=BE; updated on
1150 exit */
1151 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001152 );
1153
1154/* Returns a Python string using the UTF-32 encoding in native byte
1155 order. The string always starts with a BOM mark. */
1156
1157PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001158 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001159 );
1160
1161/* Returns a Python string object holding the UTF-32 encoded value of
1162 the Unicode data.
1163
1164 If byteorder is not 0, output is written according to the following
1165 byte order:
1166
1167 byteorder == -1: little endian
1168 byteorder == 0: native byte order (writes a BOM mark)
1169 byteorder == 1: big endian
1170
1171 If byteorder is 0, the output string will always start with the
1172 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1173 prepended.
1174
1175*/
1176
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001177#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001178PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 const Py_UNICODE *data, /* Unicode char buffer */
1180 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1181 const char *errors, /* error handling */
1182 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001183 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001184#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001185
Guido van Rossumd8225182000-03-10 22:33:05 +00001186/* --- UTF-16 Codecs ------------------------------------------------------ */
1187
Guido van Rossum9e896b32000-04-05 20:11:21 +00001188/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001189 the corresponding Unicode object.
1190
1191 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001192 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001193
1194 If byteorder is non-NULL, the decoder starts decoding using the
1195 given byte order:
1196
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 *byteorder == -1: little endian
1198 *byteorder == 0: native order
1199 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001200
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001201 In native mode, the first two bytes of the stream are checked for a
1202 BOM mark. If found, the BOM mark is analysed, the byte order
1203 adjusted and the BOM skipped. In the other modes, no BOM mark
1204 interpretation is done. After completion, *byteorder is set to the
1205 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001206
1207 If byteorder is NULL, the codec starts in native order mode.
1208
1209*/
1210
Mark Hammond91a681d2002-08-12 07:21:58 +00001211PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001212 const char *string, /* UTF-16 encoded string */
1213 Py_ssize_t length, /* size of string */
1214 const char *errors, /* error handling */
1215 int *byteorder /* pointer to byteorder to use
1216 0=native;-1=LE,1=BE; updated on
1217 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001218 );
1219
Walter Dörwald69652032004-09-07 20:24:22 +00001220PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001221 const char *string, /* UTF-16 encoded string */
1222 Py_ssize_t length, /* size of string */
1223 const char *errors, /* error handling */
1224 int *byteorder, /* pointer to byteorder to use
1225 0=native;-1=LE,1=BE; updated on
1226 exit */
1227 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001228 );
1229
Guido van Rossumd8225182000-03-10 22:33:05 +00001230/* Returns a Python string using the UTF-16 encoding in native byte
1231 order. The string always starts with a BOM mark. */
1232
Mark Hammond91a681d2002-08-12 07:21:58 +00001233PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001235 );
1236
1237/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001238 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001239
1240 If byteorder is not 0, output is written according to the following
1241 byte order:
1242
1243 byteorder == -1: little endian
1244 byteorder == 0: native byte order (writes a BOM mark)
1245 byteorder == 1: big endian
1246
1247 If byteorder is 0, the output string will always start with the
1248 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1249 prepended.
1250
1251 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1252 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001253 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001254
1255*/
1256
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001257#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001258PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001259 const Py_UNICODE *data, /* Unicode char buffer */
1260 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1261 const char *errors, /* error handling */
1262 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001263 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001264#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001265
1266/* --- Unicode-Escape Codecs ---------------------------------------------- */
1267
Mark Hammond91a681d2002-08-12 07:21:58 +00001268PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 const char *string, /* Unicode-Escape encoded string */
1270 Py_ssize_t length, /* size of string */
1271 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001272 );
1273
Mark Hammond91a681d2002-08-12 07:21:58 +00001274PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001276 );
1277
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001278#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001279PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 const Py_UNICODE *data, /* Unicode char buffer */
1281 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001282 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001283#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001284
1285/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1286
Mark Hammond91a681d2002-08-12 07:21:58 +00001287PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 const char *string, /* Raw-Unicode-Escape encoded string */
1289 Py_ssize_t length, /* size of string */
1290 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001291 );
1292
Mark Hammond91a681d2002-08-12 07:21:58 +00001293PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001295 );
1296
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001298PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 const Py_UNICODE *data, /* Unicode char buffer */
1300 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001301 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001302#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001303
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001304/* --- Unicode Internal Codec ---------------------------------------------
1305
1306 Only for internal use in _codecsmodule.c */
1307
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001308#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001309PyObject *_PyUnicode_DecodeUnicodeInternal(
1310 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001311 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001312 const char *errors
1313 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001315
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001317
1318 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1319
1320*/
1321
Mark Hammond91a681d2002-08-12 07:21:58 +00001322PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 const char *string, /* Latin-1 encoded string */
1324 Py_ssize_t length, /* size of string */
1325 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001326 );
1327
Mark Hammond91a681d2002-08-12 07:21:58 +00001328PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001330 );
1331
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001332#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1334 PyObject* unicode,
1335 const char* errors);
1336
Mark Hammond91a681d2002-08-12 07:21:58 +00001337PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 const Py_UNICODE *data, /* Unicode char buffer */
1339 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1340 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001341 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001342#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001343
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001345
1346 Only 7-bit ASCII data is accepted. All other codes generate errors.
1347
1348*/
1349
Mark Hammond91a681d2002-08-12 07:21:58 +00001350PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 const char *string, /* ASCII encoded string */
1352 Py_ssize_t length, /* size of string */
1353 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001354 );
1355
Mark Hammond91a681d2002-08-12 07:21:58 +00001356PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001358 );
1359
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001360#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1362 PyObject* unicode,
1363 const char* errors);
1364
Mark Hammond91a681d2002-08-12 07:21:58 +00001365PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 const Py_UNICODE *data, /* Unicode char buffer */
1367 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1368 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001369 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001370#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001371
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001373
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001375
1376 Decoding mappings must map single string characters to single
1377 Unicode characters, integers (which are then interpreted as Unicode
1378 ordinals) or None (meaning "undefined mapping" and causing an
1379 error).
1380
1381 Encoding mappings must map single Unicode characters to single
1382 string characters, integers (which are then interpreted as Latin-1
1383 ordinals) or None (meaning "undefined mapping" and causing an
1384 error).
1385
1386 If a character lookup fails with a LookupError, the character is
1387 copied as-is meaning that its ordinal value will be interpreted as
1388 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1389 to contain those mappings which map characters to different code
1390 points.
1391
1392*/
1393
Mark Hammond91a681d2002-08-12 07:21:58 +00001394PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 const char *string, /* Encoded string */
1396 Py_ssize_t length, /* size of string */
1397 PyObject *mapping, /* character mapping
1398 (char ordinal -> unicode ordinal) */
1399 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001400 );
1401
Mark Hammond91a681d2002-08-12 07:21:58 +00001402PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 PyObject *unicode, /* Unicode object */
1404 PyObject *mapping /* character mapping
1405 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001406 );
1407
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001408#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001409PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 const Py_UNICODE *data, /* Unicode char buffer */
1411 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1412 PyObject *mapping, /* character mapping
1413 (unicode ordinal -> char ordinal) */
1414 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001415 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001416#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001417
1418/* Translate a Py_UNICODE buffer of the given length by applying a
1419 character mapping table to it and return the resulting Unicode
1420 object.
1421
1422 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001424
1425 Mapping tables may be dictionaries or sequences. Unmapped character
1426 ordinals (ones which cause a LookupError) are left untouched and
1427 are copied as-is.
1428
1429*/
1430
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001431#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001432PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 const Py_UNICODE *data, /* Unicode char buffer */
1434 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1435 PyObject *table, /* Translate table */
1436 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001437 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001438#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001439
Victor Stinner99b95382011-07-04 14:23:54 +02001440#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001441
Guido van Rossumefec1152000-03-28 02:01:15 +00001442/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001443
Mark Hammond91a681d2002-08-12 07:21:58 +00001444PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001445 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001446 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001447 const char *errors /* error handling */
1448 );
1449
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001450PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1451 const char *string, /* MBCS encoded string */
1452 Py_ssize_t length, /* size of string */
1453 const char *errors, /* error handling */
1454 Py_ssize_t *consumed /* bytes consumed */
1455 );
1456
Mark Hammond91a681d2002-08-12 07:21:58 +00001457PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001458 PyObject *unicode /* Unicode object */
1459 );
1460
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001461#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001462PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001463 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001464 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001465 const char *errors /* error handling */
1466 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001467#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001468
Victor Stinner99b95382011-07-04 14:23:54 +02001469#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001470
Guido van Rossum9e896b32000-04-05 20:11:21 +00001471/* --- Decimal Encoder ---------------------------------------------------- */
1472
1473/* Takes a Unicode string holding a decimal value and writes it into
1474 an output buffer using standard ASCII digit codes.
1475
1476 The output buffer has to provide at least length+1 bytes of storage
1477 area. The output string is 0-terminated.
1478
1479 The encoder converts whitespace to ' ', decimal characters to their
1480 corresponding ASCII digit and all other Latin-1 characters except
1481 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1482 are treated as errors. This includes embedded NULL bytes.
1483
1484 Error handling is defined by the errors argument:
1485
1486 NULL or "strict": raise a ValueError
1487 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001488 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001489 "replace": replaces illegal characters with '?'
1490
1491 Returns 0 on success, -1 on failure.
1492
1493*/
1494
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001495#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001496PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 Py_UNICODE *s, /* Unicode buffer */
1498 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1499 char *output, /* Output buffer; must have size >= length+1
                     (the output string is 0-terminated) */
1500 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001501 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001502#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001503
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001504/* Transforms code points that have decimal digit property to the
1505 corresponding ASCII digit code points.
1506
1507 Returns a new Unicode string on success, NULL on failure.
1508*/
1509
Georg Brandlb5503082010-12-05 11:40:48 +00001510#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001511PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1512 Py_UNICODE *s, /* Unicode buffer */
1513 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1514 );
Georg Brandlb5503082010-12-05 11:40:48 +00001515#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1518 as argument instead of a raw buffer and length. This function additionally
1519 transforms spaces to ASCII because this is what the callers in longobject,
1520 floatobject, and complexobject did anyways. */
1521
1522#ifndef Py_LIMITED_API
1523PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1524 PyObject *unicode /* Unicode object */
1525 );
1526#endif
1527
Martin v. Löwis011e8422009-05-05 04:43:17 +00001528/* --- File system encoding ---------------------------------------------- */
1529
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001530/* ParseTuple converter: encode str objects to bytes using
1531 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001532
1533PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1534
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001535/* ParseTuple converter: decode bytes objects to unicode using
1536 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1537
1538PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1539
Victor Stinner77c38622010-05-14 15:58:55 +00001540/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1541 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001542
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001543 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1544 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001545
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001546 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001547*/
1548
1549PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1550 const char *s /* encoded string */
1551 );
1552
Victor Stinner77c38622010-05-14 15:58:55 +00001553/* Decode a string using Py_FileSystemDefaultEncoding
1554 and the "surrogateescape" error handler.
1555
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001556 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1557 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001558*/
1559
Martin v. Löwis011e8422009-05-05 04:43:17 +00001560PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1561 const char *s, /* encoded string */
1562 Py_ssize_t size /* size */
1563 );
1564
Victor Stinnerae6265f2010-05-15 16:27:27 +00001565/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001566 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001567
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001568 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1569 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001570*/
1571
1572PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1573 PyObject *unicode
1574 );
1575
Guido van Rossumd8225182000-03-10 22:33:05 +00001576/* --- Methods & Slots ----------------------------------------------------
1577
1578 These are capable of handling Unicode objects and strings on input
1579 (we refer to them as strings in the descriptions) and return
1580 Unicode objects or integers as appropriate. */
1581
1582/* Concat two strings giving a new Unicode string. */
1583
Mark Hammond91a681d2002-08-12 07:21:58 +00001584PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001585 PyObject *left, /* Left string */
1586 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001587 );
1588
Walter Dörwald1ab83302007-05-18 17:15:44 +00001589/* Concat two strings and put the result in *pleft
1590 (sets *pleft to NULL on error) */
1591
1592PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 PyObject **pleft, /* Pointer to left string */
1594 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001595 );
1596
1597/* Concat two strings, put the result in *pleft and drop the right object
1598 (sets *pleft to NULL on error) */
1599
1600PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 PyObject **pleft, /* Pointer to left string */
1602 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001603 );
1604
Guido van Rossumd8225182000-03-10 22:33:05 +00001605/* Split a string giving a list of Unicode strings.
1606
1607 If sep is NULL, splitting will be done at all whitespace
1608 substrings. Otherwise, splits occur at the given separator.
1609
1610 At most maxsplit splits will be done. If negative, no limit is set.
1611
1612 Separators are not included in the resulting list.
1613
1614*/
1615
Mark Hammond91a681d2002-08-12 07:21:58 +00001616PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 PyObject *s, /* String to split */
1618 PyObject *sep, /* String separator */
1619 Py_ssize_t maxsplit /* Maxsplit count */
1620 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001621
1622/* Ditto, but split at line breaks.
1623
1624 CRLF is considered to be one line break. Line breaks are not
1625 included in the resulting list. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626
Mark Hammond91a681d2002-08-12 07:21:58 +00001627PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 PyObject *s, /* String to split */
1629 int keepends /* If true, line end markers are included */
1630 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001631
Thomas Wouters477c8d52006-05-27 19:21:47 +00001632/* Partition a string using a given separator. */
1633
1634PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 PyObject *s, /* String to partition */
1636 PyObject *sep /* String separator */
1637 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001638
1639/* Partition a string using a given separator, searching from the end of the
1640 string. */
1641
1642PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 PyObject *s, /* String to partition */
1644 PyObject *sep /* String separator */
1645 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001646
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001647/* Split a string giving a list of Unicode strings.
1648
1649 If sep is NULL, splitting will be done at all whitespace
1650 substrings. Otherwise, splits occur at the given separator.
1651
1652 At most maxsplit splits will be done; if maxsplit is negative, no
1653 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1654 the end of the string.
1655
1656 Separators are not included in the resulting list.
1657
1658*/
1659
1660PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001661 PyObject *s, /* String to split */
1662 PyObject *sep, /* String separator */
1663 Py_ssize_t maxsplit /* Maxsplit count */
1664 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001665
Guido van Rossumd8225182000-03-10 22:33:05 +00001666/* Translate a string by applying a character mapping table to it and
1667 return the resulting Unicode object.
1668
1669 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001671
1672 Mapping tables may be dictionaries or sequences. Unmapped character
1673 ordinals (ones which cause a LookupError) are left untouched and
1674 are copied as-is.
1675
1676*/
1677
Mark Hammond91a681d2002-08-12 07:21:58 +00001678PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 PyObject *str, /* String */
1680 PyObject *table, /* Translate table */
1681 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001682 );
1683
1684/* Join a sequence of strings using the given separator and return
1685 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686
Mark Hammond91a681d2002-08-12 07:21:58 +00001687PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 PyObject *separator, /* Separator string */
1689 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001690 );
1691
1692/* Return 1 if substr matches str[start:end] at the given tail end, 0
1693 otherwise. */
1694
Martin v. Löwis18e16552006-02-15 17:27:45 +00001695PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 PyObject *str, /* String */
1697 PyObject *substr, /* Prefix or Suffix string */
1698 Py_ssize_t start, /* Start index */
1699 Py_ssize_t end, /* Stop index */
1700 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001701 );
1702
1703/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001704 given search direction or -1 if not found. -2 is returned in case
1705 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001706
Martin v. Löwis18e16552006-02-15 17:27:45 +00001707PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 PyObject *str, /* String */
1709 PyObject *substr, /* Substring to find */
1710 Py_ssize_t start, /* Start index */
1711 Py_ssize_t end, /* Stop index */
1712 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001713 );
1714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715/* Like PyUnicode_Find, but search for single character only. */
1716PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1717 PyObject *str,
1718 Py_UCS4 ch,
1719 Py_ssize_t start,
1720 Py_ssize_t end,
1721 int direction
1722 );
1723
Barry Warsaw51ac5802000-03-20 16:36:48 +00001724/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001725
Martin v. Löwis18e16552006-02-15 17:27:45 +00001726PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 PyObject *str, /* String */
1728 PyObject *substr, /* Substring to count */
1729 Py_ssize_t start, /* Start index */
1730 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001731 );
1732
Barry Warsaw51ac5802000-03-20 16:36:48 +00001733/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001734 and return the resulting Unicode object. */
1735
Mark Hammond91a681d2002-08-12 07:21:58 +00001736PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 PyObject *str, /* String */
1738 PyObject *substr, /* Substring to find */
1739 PyObject *replstr, /* Substring to replace */
1740 Py_ssize_t maxcount /* Max. number of replacements to apply;
1741 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001742 );
1743
1744/* Compare two strings and return -1, 0, 1 for less than, equal,
1745 greater than resp. */
1746
Mark Hammond91a681d2002-08-12 07:21:58 +00001747PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 PyObject *left, /* Left string */
1749 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001750 );
1751
Martin v. Löwis5b222132007-06-10 09:51:05 +00001752PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1753 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001754 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001755 );
1756
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001757/* Rich compare two strings and return one of the following:
1758
1759 - NULL in case an exception was raised
1760 - Py_True or Py_False for successful comparisons
1761 - Py_NotImplemented in case the type combination is unknown
1762
1763 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1764 case the conversion of the arguments to Unicode fails with a
1765 UnicodeDecodeError.
1766
1767 Possible values for op:
1768
1769 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1770
1771*/
1772
1773PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001774 PyObject *left, /* Left string */
1775 PyObject *right, /* Right string */
1776 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001777 );
1778
Thomas Wouters7e474022000-07-16 12:04:32 +00001779/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001780 the resulting Unicode string. */
1781
Mark Hammond91a681d2002-08-12 07:21:58 +00001782PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001783 PyObject *format, /* Format string */
1784 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001785 );
1786
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001787/* Checks whether element is contained in container and returns 1/0
1788 accordingly.
1789
1790 element has to coerce to a one-element Unicode string. -1 is
1791 returned in case of an error. */
1792
Mark Hammond91a681d2002-08-12 07:21:58 +00001793PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001794 PyObject *container, /* Container string */
1795 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001796 );
1797
Martin v. Löwis47383402007-08-15 07:32:56 +00001798/* Checks whether argument is a valid identifier. */
1799
1800PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1801
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001802#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001803/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001804PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001805 PyUnicodeObject *self,
1806 int striptype,
1807 PyObject *sepobj
1808 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001809#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001810
Eric Smith5807c412008-05-11 21:00:57 +00001811/* Using the current locale, insert the thousands grouping
1812 into the string pointed to by buffer. For the argument descriptions,
1813 see Objects/stringlib/localeutil.h */
1814
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001815#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001816PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1817 Py_ssize_t n_buffer,
1818 Py_UNICODE *digits,
1819 Py_ssize_t n_digits,
1820 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001821#endif
Eric Smith5807c412008-05-11 21:00:57 +00001822
Eric Smitha3b1ac82009-04-03 14:45:06 +00001823/* Using explicit passed-in values, insert the thousands grouping
1824 into the string pointed to by buffer. For the argument descriptions,
1825 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001826#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1828 int kind,
1829 void *buffer,
1830 Py_ssize_t n_buffer,
1831 void *digits,
1832 Py_ssize_t n_digits,
1833 Py_ssize_t min_width,
1834 const char *grouping,
1835 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001836#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001837/* === Characters Type APIs =============================================== */
1838
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001839/* Helper array used by Py_UNICODE_ISSPACE(). */
1840
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001841#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001842PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1843
Guido van Rossumd8225182000-03-10 22:33:05 +00001844/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001845 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001846
1847 These APIs are implemented in Objects/unicodectype.c.
1848
1849*/
1850
Mark Hammond91a681d2002-08-12 07:21:58 +00001851PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001852 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001853 );
1854
Mark Hammond91a681d2002-08-12 07:21:58 +00001855PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001856 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001857 );
1858
Mark Hammond91a681d2002-08-12 07:21:58 +00001859PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001860 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001861 );
1862
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001863PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001864 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001865 );
1866
1867PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001868 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001869 );
1870
Mark Hammond91a681d2002-08-12 07:21:58 +00001871PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001872 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001873 );
1874
Mark Hammond91a681d2002-08-12 07:21:58 +00001875PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001876 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001877 );
1878
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001879PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1880 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001881 );
1882
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001883PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1884 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001885 );
1886
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001887PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1888 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001889 );
1890
Mark Hammond91a681d2002-08-12 07:21:58 +00001891PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001892 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001893 );
1894
Mark Hammond91a681d2002-08-12 07:21:58 +00001895PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001896 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001897 );
1898
Mark Hammond91a681d2002-08-12 07:21:58 +00001899PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001900 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001901 );
1902
Mark Hammond91a681d2002-08-12 07:21:58 +00001903PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001904 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001905 );
1906
Mark Hammond91a681d2002-08-12 07:21:58 +00001907PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001908 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001909 );
1910
Mark Hammond91a681d2002-08-12 07:21:58 +00001911PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001912 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001913 );
1914
Georg Brandl559e5d72008-06-11 18:37:52 +00001915PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001916 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001917 );
1918
Mark Hammond91a681d2002-08-12 07:21:58 +00001919PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001920 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001921 );
1922
Victor Stinneref8d95c2010-08-16 22:03:11 +00001923PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1924 const Py_UNICODE *u
1925 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001926
1927PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001928 Py_UNICODE *s1,
1929 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001930
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001931PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1932 Py_UNICODE *s1, const Py_UNICODE *s2);
1933
Martin v. Löwis5b222132007-06-10 09:51:05 +00001934PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001935 Py_UNICODE *s1,
1936 const Py_UNICODE *s2,
1937 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001938
1939PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001940 const Py_UNICODE *s1,
1941 const Py_UNICODE *s2
1942 );
1943
1944PyAPI_FUNC(int) Py_UNICODE_strncmp(
1945 const Py_UNICODE *s1,
1946 const Py_UNICODE *s2,
1947 size_t n
1948 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001949
1950PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001951 const Py_UNICODE *s,
1952 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001953 );
1954
Victor Stinner331ea922010-08-10 16:37:20 +00001955PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001956 const Py_UNICODE *s,
1957 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001958 );
1959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001960PyAPI_FUNC(size_t) Py_UCS4_strlen(
1961 const Py_UCS4 *u
1962 );
1963
1964PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1965 Py_UCS4 *s1,
1966 const Py_UCS4 *s2);
1967
1968PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1969 Py_UCS4 *s1, const Py_UCS4 *s2);
1970
1971PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1972 Py_UCS4 *s1,
1973 const Py_UCS4 *s2,
1974 size_t n);
1975
1976PyAPI_FUNC(int) Py_UCS4_strcmp(
1977 const Py_UCS4 *s1,
1978 const Py_UCS4 *s2
1979 );
1980
1981PyAPI_FUNC(int) Py_UCS4_strncmp(
1982 const Py_UCS4 *s1,
1983 const Py_UCS4 *s2,
1984 size_t n
1985 );
1986
1987PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1988 const Py_UCS4 *s,
1989 Py_UCS4 c
1990 );
1991
1992PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1993 const Py_UCS4 *s,
1994 Py_UCS4 c
1995 );
1996
Victor Stinner71133ff2010-09-01 23:43:53 +00001997/* Create a copy of a unicode string ending with a nul character. Return NULL
1998 and raise a MemoryError exception on memory allocation failure, otherwise
1999 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2000
Victor Stinner46408602010-09-03 16:18:00 +00002001PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002002 PyObject *unicode
2003 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002004#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002005
Guido van Rossumd8225182000-03-10 22:33:05 +00002006#ifdef __cplusplus
2007}
2008#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002009#endif /* !Py_UNICODEOBJECT_H */