blob: 1b6d1c94e12745d7608a3059f550b14fd61af43f [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test: code points below 128 are answered from the
   _Py_ascii_whitespace lookup table, everything else is delegated to
   _PyUnicode_IsWhitespace().  Note that ch is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Case predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.  ch may be
   evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy length Py_UNICODE code units from source to target. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store value into the first length code units of target.  Every argument
   is evaluated exactly once, so expressions with side effects are safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        const Py_UNICODE v_ = (value); \
        const Py_ssize_t n_ = (length); \
        Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) \
            t_[i_] = v_; \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.  The argument is fully parenthesized so
   that compound expressions (e.g. "a | b" or "c ? x : y") are classified
   correctly; it is evaluated twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
    (((((Py_UCS4)(high) & 0x03FF) << 10) | \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before falling back to
   memcmp() on the whole wstr range. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == \
       *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, \
             (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner910337b2011-10-03 03:20:16 +0200209 /* Unicode strings can be in 4 states:
210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
218 * utf8 = data
219
220 - compact:
221
222 * structure = PyCompactUnicodeObject
223 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
224 PyUnicode_4BYTE_KIND
225 * compact = 1
226 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200227 * ascii = 0
Victor Stinner85041a52011-10-03 14:42:39 +0200228 * utf8 != data
Victor Stinner910337b2011-10-03 03:20:16 +0200229
230 - string created by the legacy API (not ready):
231
232 * structure = PyUnicodeObject
233 * kind = PyUnicode_WCHAR_KIND
234 * compact = 0
235 * ready = 0
236 * wstr is not NULL
237 * data.any is NULL
238 * utf8 is NULL
239 * interned = SSTATE_NOT_INTERNED
Victor Stinnera3b334d2011-10-03 13:53:37 +0200240 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200241
242 - string created by the legacy API, ready:
243
244 * structure = PyUnicodeObject structure
245 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
246 PyUnicode_4BYTE_KIND
247 * compact = 0
248 * ready = 1
249 * data.any is not NULL
Victor Stinner85041a52011-10-03 14:42:39 +0200250 * utf8 = data if ascii is 1
Victor Stinner910337b2011-10-03 03:20:16 +0200251
252 String created by the legacy API becomes ready when calling
253 PyUnicode_READY().
254
255 See also _PyUnicode_CheckConsistency(). */
Guido van Rossumd8225182000-03-10 22:33:05 +0000256 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200257 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000258 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200259 struct {
260 /*
261 SSTATE_NOT_INTERNED (0)
262 SSTATE_INTERNED_MORTAL (1)
263 SSTATE_INTERNED_IMMORTAL (2)
264
265 If interned != SSTATE_NOT_INTERNED, the two references from the
266 dictionary to this object are *not* counted in ob_refcnt.
267 */
268 unsigned int interned:2;
269 /* Character size:
270
271 PyUnicode_WCHAR_KIND (0): wchar_t*
272 PyUnicode_1BYTE_KIND (1): Py_UCS1*
273 PyUnicode_2BYTE_KIND (2): Py_UCS2*
274 PyUnicode_4BYTE_KIND (3): Py_UCS4*
275 */
276 unsigned int kind:2;
277 /* Compact is with respect to the allocation scheme. Compact unicode
278 objects only require one memory block while non-compact objects use
279 one block for the PyUnicodeObject struct and another for its data
280 buffer. */
281 unsigned int compact:1;
Victor Stinnera3b334d2011-10-03 13:53:37 +0200282 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
283 characters. If ascii is 1 and compact is 1, use the PyASCIIObject
284 structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285 unsigned int ascii:1;
286 /* The ready flag indicates whether the object layout is initialized
287 completely. This means that this is either a compact object, or
288 the data pointer is filled out. The bit is redundant, and helps
289 to minimize the test in PyUnicode_IS_READY(). */
290 unsigned int ready:1;
291 } state;
292 wchar_t *wstr; /* wchar_t representation (null-terminated) */
293} PyASCIIObject;
294
295/* Non-ASCII strings allocated through PyUnicode_New use the
296 PyCompactUnicodeOject structure. state.compact is set, and the data
297 immediately follow the structure. */
298typedef struct {
299 PyASCIIObject _base;
300 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
301 * terminating \0. */
302 char *utf8; /* UTF-8 representation (null-terminated) */
303 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
304 * surrogates count as two code points. */
305} PyCompactUnicodeObject;
306
307/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
308 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200309 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200310typedef struct {
311 PyCompactUnicodeObject _base;
312 union {
313 void *any;
314 Py_UCS1 *latin1;
315 Py_UCS2 *ucs2;
316 Py_UCS4 *ucs4;
317 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000318} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000319#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000320
Mark Hammond91a681d2002-08-12 07:21:58 +0000321PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000322PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000323
Thomas Wouters27d517b2007-02-25 20:39:11 +0000324#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000325 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
326#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000327
328/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000329#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330
/* Length of the wstr representation.  Compact ASCII objects use the
   PyASCIIObject layout, so the value is read from the length field;
   all other objects carry an explicit wstr_length.  op is
   parenthesized before the cast so that pointer expressions such as
   "p + 1" are converted as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
335
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? \
         PyUnicode_WSTR_LENGTH(op) : \
         ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
          PyUnicode_WSTR_LENGTH(op)))

/* Same size expressed in bytes (code units * Py_UNICODE_SIZE). */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
350
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
         PyUnicode_AsUnicode((PyObject *)(op)))

/* The same buffer viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
363
364
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
366
/* Values for PyUnicodeObject.state: */

/* Interning state (stored in the 2-bit state.interned field). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
373
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  op is parenthesized before the cast so
   that pointer expressions are converted as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not. No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200389
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet. */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro: */

#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3
400
401
/* Return the number of bytes the string uses to represent single characters,
   this can be 1, 2 or 4 (computed as 1 << (kind - 1)).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op) \
    (1 << (PyUnicode_KIND(op) - 1))
408
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
417
/* Return one of the PyUnicode_*_KIND values defined above.  In debug
   builds the macro asserts that op is a unicode object and is ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->state.kind)
423
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the buffer starts right after the object header, whose
   size depends on whether the PyASCIIObject or the PyCompactUnicodeObject
   layout is in use. */
#define _PyUnicode_COMPACT_DATA(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((void*)((PyASCIIObject*)(op) + 1)) : \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the buffer is referenced by data.any, which is
   asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op) \
    (assert(((PyUnicodeObject*)(op))->data.any), \
     ((((PyUnicodeObject *)(op))->data.any)))

#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
     _PyUnicode_NONCOMPACT_DATA(op))
438
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
   The index is a character index, the result is a size in bytes.

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
444
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.  The value
   is cast to the element type selected by kind. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
471
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  Any kind other than the 1-byte and 2-byte
   kinds is read as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
483
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.  The unicode argument is
   evaluated several times. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     (Py_UCS4) \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499
/* Returns the length of the unicode string.  The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->length)
507
508
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   op is parenthesized before the cast so that pointer expressions are
   converted as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
513
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \
    (assert(PyUnicode_Check(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   Compact ASCII strings, and 1-byte strings whose data pointer equals the
   utf8 pointer, yield 0x7f; other 1-byte strings 0xff; 2-byte strings
   0xffff; everything else 0x10ffff. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7fU : \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU) \
       ) : \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
        (0xffffU) : (0x10ffffU) \
       ))))
536
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000537#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000538
539/* --- Constants ---------------------------------------------------------- */
540
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace".  Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000547
548/* === Public API ========================================================= */
549
550/* --- Plain Py_UNICODE --------------------------------------------------- */
551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552/* With PEP 393, this is the recommended way to allocate a new unicode object.
553 This function will allocate the object and its buffer in a single memory
554 block. Objects created using this function are not resizable. */
555#ifndef Py_LIMITED_API
556PyAPI_FUNC(PyObject*) PyUnicode_New(
557 Py_ssize_t size, /* Number of code points in the new string */
558 Py_UCS4 maxchar /* maximum code point value in the string */
559 );
560#endif
561
Victor Stinnerd8f65102011-09-29 19:43:17 +0200562/* Initializes the canonical string representation from a the deprecated
563 wstr/Py_UNICODE representation. This function is used to convert Unicode
564 objects which were created using the old API to the new flexible format
565 introduced with PEP 393.
566
567 Don't call this function directly, use the public PyUnicode_READY() macro
568 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569#ifndef Py_LIMITED_API
570PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200571 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200572 );
573#endif
574
Victor Stinner034f6cf2011-09-30 02:26:44 +0200575/* Get a copy of a Unicode string. */
576PyAPI_FUNC(PyObject*) PyUnicode_Copy(
577 PyObject *unicode
578 );
579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200580/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200581 character conversion when necessary and falls back to memcpy if possible.
582
Victor Stinnera0702ab2011-09-29 14:14:38 +0200583 Fail if to is too small (smaller than how_many or smaller than
584 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
585 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200586
587 Return the number of written character, or return -1 and raise an exception
588 on error.
589
590 Pseudo-code:
591
592 how_many = min(how_many, len(from) - from_start)
593 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
594 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200595
596 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200597 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200599PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200600 PyObject *to,
601 Py_ssize_t to_start,
602 PyObject *from,
603 Py_ssize_t from_start,
604 Py_ssize_t how_many
605 );
606#endif
607
Guido van Rossumd8225182000-03-10 22:33:05 +0000608/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000609 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000610
611 u may be NULL which causes the contents to be undefined. It is the
612 user's responsibility to fill in the needed data afterwards. Note
613 that modifying the Unicode object contents after construction is
614 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000615
616 The buffer is copied into the new object. */
617
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000618#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000619PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000620 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000621 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000622 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000623#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000624
Georg Brandl952867a2010-06-27 10:17:12 +0000625/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000627 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000628 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000629 );
630
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000631/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200632 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000633PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000634 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000635 );
636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637#ifndef Py_LIMITED_API
638PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
639 int kind,
640 const void *buffer,
641 Py_ssize_t size);
642#endif
643
644PyAPI_FUNC(PyObject*) PyUnicode_Substring(
645 PyObject *str,
646 Py_ssize_t start,
647 Py_ssize_t end);
648
649/* Copy the string into a UCS4 buffer including the null character if copy_null
650 is set. Return NULL and raise an exception on error. Raise a ValueError if
651 the buffer is smaller than the string. Return buffer on success.
652
653 buflen is the length of the buffer in (Py_UCS4) characters. */
654PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
655 PyObject *unicode,
656 Py_UCS4* buffer,
657 Py_ssize_t buflen,
658 int copy_null);
659
660/* Copy the string into a UCS4 buffer. A new buffer is allocated using
661 * PyMem_Malloc; if this fails, NULL is returned with a memory error
662 exception set. */
663PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
664
Guido van Rossumd8225182000-03-10 22:33:05 +0000665/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 Py_UNICODE buffer.
667 If the wchar_t/Py_UNICODE representation is not yet available, this
668 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000669
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000670#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000671PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000673 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000674#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676/* Return a read-only pointer to the Unicode object's internal
677 Py_UNICODE buffer and save the length at size.
678 If the wchar_t/Py_UNICODE representation is not yet available, this
679 function will calculate it. */
680
681#ifndef Py_LIMITED_API
682PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
683 PyObject *unicode, /* Unicode object */
684 Py_ssize_t *size /* location where to save the length */
685 );
686#endif
687
Guido van Rossumd8225182000-03-10 22:33:05 +0000688/* Get the length of the Unicode object. */
689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
691 PyObject *unicode
692);
693
Victor Stinner157f83f2011-09-28 21:41:31 +0200694/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200695 string representation. */
696
Martin v. Löwis18e16552006-02-15 17:27:45 +0000697PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000699 );
700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701/* Read a character from the string. */
702
703PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
704 PyObject *unicode,
705 Py_ssize_t index
706 );
707
708/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200709 PyUnicode_New, must not be shared, and must not have been hashed yet.
710
711 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712
713PyAPI_FUNC(int) PyUnicode_WriteChar(
714 PyObject *unicode,
715 Py_ssize_t index,
716 Py_UCS4 character
717 );
718
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000719#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000720/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000721PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000722#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000723
Guido van Rossum52c23592000-04-10 13:41:41 +0000724/* Resize an already allocated Unicode object to the new length.
725
726 *unicode is modified to point to the new (resized) object and 0
727 returned on success.
728
729 This API may only be called by the function which also called the
730 Unicode constructor. The refcount on the object must be 1. Otherwise,
731 an error is returned.
732
733 Error handling is implemented as follows: an exception is set, -1
734 is returned and *unicode left untouched.
735
736*/
737
Mark Hammond91a681d2002-08-12 07:21:58 +0000738PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 PyObject **unicode, /* Pointer to the Unicode object */
740 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000741 );
742
Guido van Rossumd8225182000-03-10 22:33:05 +0000743/* Coerce obj to a Unicode object and return a reference with
744 *incremented* refcount.
745
746 Coercion is done in the following way:
747
Georg Brandl952867a2010-06-27 10:17:12 +0000748 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000749 using the given encoding and error handling; when these are NULL,
750 UTF-8 and "strict" mode are used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000751
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000752 2. All other objects (including Unicode objects) raise an
753 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000754
755 The API returns NULL in case of an error. The caller is responsible
756 for decref'ing the returned objects.
757
758*/
759
Mark Hammond91a681d2002-08-12 07:21:58 +0000760PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000761 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000762 const char *encoding, /* encoding */
763 const char *errors /* error handling */
764 );
765
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000766/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000767 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000769 Unicode objects are passed back as-is (subclasses are converted to
770 true Unicode objects), all other objects are delegated to
771 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000772 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000773
774 The API returns NULL in case of an error. The caller is responsible
775 for decref'ing the returned objects.
776
777*/
778
Mark Hammond91a681d2002-08-12 07:21:58 +0000779PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000781 );
782
Victor Stinner1205f272010-09-11 00:54:47 +0000783PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
784 const char *format, /* ASCII-encoded string */
785 va_list vargs
786 );
787PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
788 const char *format, /* ASCII-encoded string */
789 ...
790 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000791
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000792#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000793/* Format the object based on the format_spec, as defined in PEP 3101
794 (Advanced String Formatting). */
795PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796 PyObject *format_spec,
797 Py_ssize_t start,
798 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000799#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000800
Walter Dörwald16807132007-05-25 13:52:07 +0000801PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
802PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000803PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
804 const char *u /* UTF-8 encoded string */
805 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000806#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000807PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000808#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000809
810/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200811#define PyUnicode_CHECK_INTERNED(op) \
812 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000813
Guido van Rossumd8225182000-03-10 22:33:05 +0000814/* --- wchar_t support for platforms which support it --------------------- */
815
816#ifdef HAVE_WCHAR_H
817
Georg Brandl952867a2010-06-27 10:17:12 +0000818/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000819 size.
820
821 The buffer is copied into the new object. */
822
Mark Hammond91a681d2002-08-12 07:21:58 +0000823PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000824 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000825 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000826 );
827
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000828/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000829 most size wchar_t characters are copied.
830
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000831 Note that the resulting wchar_t string may or may not be
832 0-terminated. It is the responsibility of the caller to make sure
833 that the wchar_t string is 0-terminated in case this is required by
834 the application.
835
836 Returns the number of wchar_t characters copied (excluding a
837 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000838 error. */
839
Martin v. Löwis18e16552006-02-15 17:27:45 +0000840PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000841 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000842 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000843 Py_ssize_t size /* size of buffer, in wchar_t characters */
Guido van Rossumd8225182000-03-10 22:33:05 +0000844 );
845
Victor Stinner137c34c2010-09-29 10:25:54 +0000846/* Convert the Unicode object to a wide character string. The output string
847 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200848 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000849
850 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
851 on success. On error, returns NULL and raises a MemoryError; *size is
852 undefined in that case. */
853
854PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000855 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000856 Py_ssize_t *size /* number of characters of the result */
857 );
858
Victor Stinner9f789e72011-10-01 03:57:28 +0200859#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200861#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Guido van Rossumd8225182000-03-10 22:33:05 +0000863#endif
864
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000865/* --- Unicode ordinals --------------------------------------------------- */
866
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867/* Create a Unicode Object from the given Unicode code point ordinal.
868
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000869 The ordinal must be in range(0x10000) on narrow Python builds
870 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
871 raised in case it is not.
872
873*/
874
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000875PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000876
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000877/* --- Free-list management ----------------------------------------------- */
878
879/* Clear the free list used by the Unicode implementation.
880
881 This can be used to release memory used for objects on the free
882 list back to the Python memory allocator.
883
884*/
885
886PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
887
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000889
890 Many of these APIs take two arguments encoding and errors. These
891 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000892 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000893
Georg Brandl952867a2010-06-27 10:17:12 +0000894 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000895
896 Error handling is set by errors which may also be set to NULL
897 meaning to use the default handling defined for the codec. Default
898 error handling for all builtin codecs is "strict" (ValueErrors are
899 raised).
900
901 The codecs all use a similar interface. Only deviations from the
902 generic ones are documented.
903
904*/
905
Fred Drakecb093fe2000-05-09 19:51:53 +0000906/* --- Manage the default encoding ---------------------------------------- */
907
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000908/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000909 Unicode object unicode and the size of the encoded representation
910 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000911
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000912 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000913
Victor Stinner157f83f2011-09-28 21:41:31 +0200914 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 and subsequent calls will return the same string. The memory is released
916 when the unicodeobject is deallocated.
917
918 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
919 support the previous internal function with the same behaviour.
920
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000921 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000922 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000923
924 *** If you need to access the Unicode object as UTF-8 bytes string,
925 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000926*/
927
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000928#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000930 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000931 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200932#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000933#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000934
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000935/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000936 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
939 in the unicodeobject.
940
941 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
942 support the previous internal function with the same behaviour.
943
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000944 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000945 extracted from the returned data.
946
947 *** This API is for interpreter INTERNAL USE ONLY and will likely
948 *** be removed or changed in the future.
949
950 *** If you need to access the Unicode object as UTF-8 bytes string,
951 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000952
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000953*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000954
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000955#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
957#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000958#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000959
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000960/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000961
Mark Hammond91a681d2002-08-12 07:21:58 +0000962PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000963
Guido van Rossumd8225182000-03-10 22:33:05 +0000964/* --- Generic Codecs ----------------------------------------------------- */
965
966/* Create a Unicode object by decoding the encoded string s of the
967 given size. */
968
Mark Hammond91a681d2002-08-12 07:21:58 +0000969PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000970 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000971 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000972 const char *encoding, /* encoding */
973 const char *errors /* error handling */
974 );
975
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000976/* Decode a Unicode object unicode and return the result as Python
977 object. */
978
979PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000980 PyObject *unicode, /* Unicode object */
981 const char *encoding, /* encoding */
982 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000983 );
984
985/* Decode a Unicode object unicode and return the result as Unicode
986 object. */
987
988PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000989 PyObject *unicode, /* Unicode object */
990 const char *encoding, /* encoding */
991 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000992 );
993
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000994/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000995 Python string object. */
996
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000997#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000998PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001001 const char *encoding, /* encoding */
1002 const char *errors /* error handling */
1003 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001004#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001005
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001006/* Encodes a Unicode object and returns the result as Python
1007 object. */
1008
1009PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001010 PyObject *unicode, /* Unicode object */
1011 const char *encoding, /* encoding */
1012 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001013 );
1014
Guido van Rossumd8225182000-03-10 22:33:05 +00001015/* Encodes a Unicode object and returns the result as Python string
1016 object. */
1017
Mark Hammond91a681d2002-08-12 07:21:58 +00001018PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001019 PyObject *unicode, /* Unicode object */
1020 const char *encoding, /* encoding */
1021 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001022 );
1023
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001024/* Encodes a Unicode object and returns the result as Unicode
1025 object. */
1026
1027PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001028 PyObject *unicode, /* Unicode object */
1029 const char *encoding, /* encoding */
1030 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001031 );
1032
1033/* Build an encoding map. */
1034
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001035PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1036 PyObject* string /* 256 character map */
1037 );
1038
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001039/* --- UTF-7 Codecs ------------------------------------------------------- */
1040
Mark Hammond91a681d2002-08-12 07:21:58 +00001041PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 const char *string, /* UTF-7 encoded string */
1043 Py_ssize_t length, /* size of string */
1044 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001045 );
1046
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001047PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048 const char *string, /* UTF-7 encoded string */
1049 Py_ssize_t length, /* size of string */
1050 const char *errors, /* error handling */
1051 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001052 );
1053
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001054#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001055PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 const Py_UNICODE *data, /* Unicode char buffer */
1057 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1058 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1059 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1060 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001062#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063
Guido van Rossumd8225182000-03-10 22:33:05 +00001064/* --- UTF-8 Codecs ------------------------------------------------------- */
1065
Mark Hammond91a681d2002-08-12 07:21:58 +00001066PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001067 const char *string, /* UTF-8 encoded string */
1068 Py_ssize_t length, /* size of string */
1069 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001070 );
1071
Walter Dörwald69652032004-09-07 20:24:22 +00001072PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001073 const char *string, /* UTF-8 encoded string */
1074 Py_ssize_t length, /* size of string */
1075 const char *errors, /* error handling */
1076 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001077 );
1078
Mark Hammond91a681d2002-08-12 07:21:58 +00001079PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001080 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001081 );
1082
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001083#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1085 PyObject *unicode,
1086 const char *errors);
1087
Mark Hammond91a681d2002-08-12 07:21:58 +00001088PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 const Py_UNICODE *data, /* Unicode char buffer */
1090 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1091 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001092 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001093#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001094
Walter Dörwald41980ca2007-08-16 21:55:45 +00001095/* --- UTF-32 Codecs ------------------------------------------------------ */
1096
1097/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1098 the corresponding Unicode object.
1099
1100 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001102
1103 If byteorder is non-NULL, the decoder starts decoding using the
1104 given byte order:
1105
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 *byteorder == -1: little endian
1107 *byteorder == 0: native order
1108 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001109
1110 In native mode, the first four bytes of the stream are checked for a
1111 BOM mark. If found, the BOM mark is analysed, the byte order
1112 adjusted and the BOM skipped. In the other modes, no BOM mark
1113 interpretation is done. After completion, *byteorder is set to the
1114 current byte order at the end of input data.
1115
1116 If byteorder is NULL, the codec starts in native order mode.
1117
1118*/
1119
1120PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 const char *string, /* UTF-32 encoded string */
1122 Py_ssize_t length, /* size of string */
1123 const char *errors, /* error handling */
1124 int *byteorder /* pointer to byteorder to use
1125 0=native;-1=LE,1=BE; updated on
1126 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001127 );
1128
1129PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001130 const char *string, /* UTF-32 encoded string */
1131 Py_ssize_t length, /* size of string */
1132 const char *errors, /* error handling */
1133 int *byteorder, /* pointer to byteorder to use
1134 0=native;-1=LE,1=BE; updated on
1135 exit */
1136 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001137 );
1138
1139/* Returns a Python string using the UTF-32 encoding in native byte
1140 order. The string always starts with a BOM mark. */
1141
1142PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001144 );
1145
1146/* Returns a Python string object holding the UTF-32 encoded value of
1147 the Unicode data.
1148
1149 If byteorder is not 0, output is written according to the following
1150 byte order:
1151
1152 byteorder == -1: little endian
1153 byteorder == 0: native byte order (writes a BOM mark)
1154 byteorder == 1: big endian
1155
1156 If byteorder is 0, the output string will always start with the
1157 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1158 prepended.
1159
1160*/
1161
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001162#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001163PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 const Py_UNICODE *data, /* Unicode char buffer */
1165 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1166 const char *errors, /* error handling */
1167 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001168 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001169#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001170
Guido van Rossumd8225182000-03-10 22:33:05 +00001171/* --- UTF-16 Codecs ------------------------------------------------------ */
1172
Guido van Rossum9e896b32000-04-05 20:11:21 +00001173/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001174 the corresponding Unicode object.
1175
1176 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001177 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001178
1179 If byteorder is non-NULL, the decoder starts decoding using the
1180 given byte order:
1181
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001182 *byteorder == -1: little endian
1183 *byteorder == 0: native order
1184 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001185
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001186 In native mode, the first two bytes of the stream are checked for a
1187 BOM mark. If found, the BOM mark is analysed, the byte order
1188 adjusted and the BOM skipped. In the other modes, no BOM mark
1189 interpretation is done. After completion, *byteorder is set to the
1190 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001191
1192 If byteorder is NULL, the codec starts in native order mode.
1193
1194*/
1195
Mark Hammond91a681d2002-08-12 07:21:58 +00001196PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 const char *string, /* UTF-16 encoded string */
1198 Py_ssize_t length, /* size of string */
1199 const char *errors, /* error handling */
1200 int *byteorder /* pointer to byteorder to use
1201 0=native;-1=LE,1=BE; updated on
1202 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001203 );
1204
Walter Dörwald69652032004-09-07 20:24:22 +00001205PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 const char *string, /* UTF-16 encoded string */
1207 Py_ssize_t length, /* size of string */
1208 const char *errors, /* error handling */
1209 int *byteorder, /* pointer to byteorder to use
1210 0=native;-1=LE,1=BE; updated on
1211 exit */
1212 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001213 );
1214
Guido van Rossumd8225182000-03-10 22:33:05 +00001215/* Returns a Python string using the UTF-16 encoding in native byte
1216 order. The string always starts with a BOM mark. */
1217
Mark Hammond91a681d2002-08-12 07:21:58 +00001218PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001220 );
1221
1222/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001223 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001224
1225 If byteorder is not 0, output is written according to the following
1226 byte order:
1227
1228 byteorder == -1: little endian
1229 byteorder == 0: native byte order (writes a BOM mark)
1230 byteorder == 1: big endian
1231
1232 If byteorder is 0, the output string will always start with the
1233 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1234 prepended.
1235
1236 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1237 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001238 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001239
1240*/
1241
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001242#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001243PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 const Py_UNICODE *data, /* Unicode char buffer */
1245 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1246 const char *errors, /* error handling */
1247 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001248 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001249#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001250
1251/* --- Unicode-Escape Codecs ---------------------------------------------- */
1252
Mark Hammond91a681d2002-08-12 07:21:58 +00001253PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 const char *string, /* Unicode-Escape encoded string */
1255 Py_ssize_t length, /* size of string */
1256 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001257 );
1258
Mark Hammond91a681d2002-08-12 07:21:58 +00001259PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001261 );
1262
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001263#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001264PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 const Py_UNICODE *data, /* Unicode char buffer */
1266 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001267 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001268#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001269
1270/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1271
Mark Hammond91a681d2002-08-12 07:21:58 +00001272PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 const char *string, /* Raw-Unicode-Escape encoded string */
1274 Py_ssize_t length, /* size of string */
1275 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001276 );
1277
Mark Hammond91a681d2002-08-12 07:21:58 +00001278PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001280 );
1281
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001282#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001283PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 const Py_UNICODE *data, /* Unicode char buffer */
1285 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001286 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001287#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001288
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001289/* --- Unicode Internal Codec ---------------------------------------------
1290
1291 Only for internal use in _codecsmodule.c */
1292
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001293#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001294PyObject *_PyUnicode_DecodeUnicodeInternal(
1295 const char *string, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001296 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001297 const char *errors /* error handling */
1298 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001299#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001300
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001301/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001302
1303 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1304
1305*/
1306
Mark Hammond91a681d2002-08-12 07:21:58 +00001307PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 const char *string, /* Latin-1 encoded string */
1309 Py_ssize_t length, /* size of string */
1310 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001311 );
1312
Mark Hammond91a681d2002-08-12 07:21:58 +00001313PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001315 );
1316
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001317#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1319 PyObject* unicode, /* Unicode object */
1320 const char* errors /* error handling */);
1321
Mark Hammond91a681d2002-08-12 07:21:58 +00001322PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 const Py_UNICODE *data, /* Unicode char buffer */
1324 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1325 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001326 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001327#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001328
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001330
1331 Only 7-bit ASCII data is accepted. All other codes generate errors.
1332
1333*/
1334
Mark Hammond91a681d2002-08-12 07:21:58 +00001335PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 const char *string, /* ASCII encoded string */
1337 Py_ssize_t length, /* size of string */
1338 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001339 );
1340
Mark Hammond91a681d2002-08-12 07:21:58 +00001341PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001343 );
1344
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001345#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1347 PyObject* unicode, /* Unicode object */
1348 const char* errors /* error handling */);
1349
Mark Hammond91a681d2002-08-12 07:21:58 +00001350PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 const Py_UNICODE *data, /* Unicode char buffer */
1352 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1353 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001354 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001355#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001356
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001358
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001360
1361 Decoding mappings must map single string characters to single
1362 Unicode characters, integers (which are then interpreted as Unicode
1363 ordinals) or None (meaning "undefined mapping" and causing an
1364 error).
1365
1366 Encoding mappings must map single Unicode characters to single
1367 string characters, integers (which are then interpreted as Latin-1
1368 ordinals) or None (meaning "undefined mapping" and causing an
1369 error).
1370
1371 If a character lookup fails with a LookupError, the character is
1372 copied as-is meaning that its ordinal value will be interpreted as
1373 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1374 to contain those mappings which map characters to different code
1375 points.
1376
1377*/
1378
Mark Hammond91a681d2002-08-12 07:21:58 +00001379PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 const char *string, /* Encoded string */
1381 Py_ssize_t length, /* size of string */
1382 PyObject *mapping, /* character mapping
1383 (char ordinal -> unicode ordinal) */
1384 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001385 );
1386
Mark Hammond91a681d2002-08-12 07:21:58 +00001387PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 PyObject *unicode, /* Unicode object */
1389 PyObject *mapping /* character mapping
1390 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001391 );
1392
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001393#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001394PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 const Py_UNICODE *data, /* Unicode char buffer */
1396 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1397 PyObject *mapping, /* character mapping
1398 (unicode ordinal -> char ordinal) */
1399 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001400 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001401#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001402
1403/* Translate a Py_UNICODE buffer of the given length by applying a
1404 character mapping table to it and return the resulting Unicode
1405 object.
1406
1407 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001409
1410 Mapping tables may be dictionaries or sequences. Unmapped character
1411 ordinals (ones which cause a LookupError) are left untouched and
1412 are copied as-is.
1413
1414*/
1415
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001416#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001417PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 const Py_UNICODE *data, /* Unicode char buffer */
1419 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1420 PyObject *table, /* Translate table (dictionary or sequence) */
1421 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001422 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001423#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001424
Victor Stinner99b95382011-07-04 14:23:54 +02001425#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001426
Guido van Rossumefec1152000-03-28 02:01:15 +00001427/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001428
Mark Hammond91a681d2002-08-12 07:21:58 +00001429PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001430 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001431 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001432 const char *errors /* error handling */
1433 );
1434
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001435PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1436 const char *string, /* MBCS encoded string */
1437 Py_ssize_t length, /* size of string */
1438 const char *errors, /* error handling */
1439 Py_ssize_t *consumed /* bytes consumed */
1440 );
1441
Mark Hammond91a681d2002-08-12 07:21:58 +00001442PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001443 PyObject *unicode /* Unicode object */
1444 );
1445
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001446#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001447PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001448 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001449 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001450 const char *errors /* error handling */
1451 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001452#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001453
Victor Stinner99b95382011-07-04 14:23:54 +02001454#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001455
Guido van Rossum9e896b32000-04-05 20:11:21 +00001456/* --- Decimal Encoder ---------------------------------------------------- */
1457
1458/* Takes a Unicode string holding a decimal value and writes it into
1459 an output buffer using standard ASCII digit codes.
1460
1461 The output buffer has to provide at least length+1 bytes of storage
1462 area. The output string is 0-terminated.
1463
1464 The encoder converts whitespace to ' ', decimal characters to their
1465 corresponding ASCII digit and all other Latin-1 characters except
1466 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1467 are treated as errors. This includes embedded NUL characters.
1468
1469 Error handling is defined by the errors argument:
1470
1471 NULL or "strict": raise a ValueError
1472 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001474 "replace": replaces illegal characters with '?'
1475
1476 Returns 0 on success, -1 on failure.
1477
1478*/
1479
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001480#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001481PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 Py_UNICODE *s, /* Unicode buffer */
1483 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1484 char *output, /* Output buffer; must have size >= length+1 (result is 0-terminated) */
1485 const char *errors /* error handling: NULL, "strict", "ignore" or "replace" */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001486 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001487#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001488
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001489/* Transforms code points that have decimal digit property to the
1490 corresponding ASCII digit code points.
1491
1492 Returns a new Unicode string on success, NULL on failure.
1493*/
1494
Georg Brandlb5503082010-12-05 11:40:48 +00001495#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001496PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1497 Py_UNICODE *s, /* Unicode buffer */
1498 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1499 );
Georg Brandlb5503082010-12-05 11:40:48 +00001500#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1503 as argument instead of a raw buffer and length. This function additionally
1504 transforms spaces to ASCII because this is what the callers in longobject,
1505 floatobject, and complexobject did anyways. */
1506
1507#ifndef Py_LIMITED_API
1508PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1509 PyObject *unicode /* Unicode object */
1510 );
1511#endif
1512
Martin v. Löwis011e8422009-05-05 04:43:17 +00001513/* --- File system encoding ---------------------------------------------- */
1514
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001515/* ParseTuple converter: encode str objects to bytes using
1516 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001517
1518PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1519
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001520/* ParseTuple converter: decode bytes objects to unicode using
1521 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1522
1523PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1524
Victor Stinner77c38622010-05-14 15:58:55 +00001525/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1526 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001527
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001528 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1529 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001530
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001531 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001532*/
1533
1534PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1535 const char *s /* encoded string */
1536 );
1537
Victor Stinner77c38622010-05-14 15:58:55 +00001538/* Decode a string using Py_FileSystemDefaultEncoding
1539 and the "surrogateescape" error handler.
1540
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001541 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1542 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001543*/
1544
Martin v. Löwis011e8422009-05-05 04:43:17 +00001545PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1546 const char *s, /* encoded string */
1547 Py_ssize_t size /* size */
1548 );
1549
Victor Stinnerae6265f2010-05-15 16:27:27 +00001550/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001551 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001552
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001553 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1554 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001555*/
1556
1557PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1558 PyObject *unicode /* Unicode object to encode */
1559 );
1560
Guido van Rossumd8225182000-03-10 22:33:05 +00001561/* --- Methods & Slots ----------------------------------------------------
1562
1563 These are capable of handling Unicode objects and strings on input
1564 (we refer to them as strings in the descriptions) and return
1565 Unicode objects or integers as appropriate. */
1566
1567/* Concat two strings giving a new Unicode string. */
1568
Mark Hammond91a681d2002-08-12 07:21:58 +00001569PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001570 PyObject *left, /* Left string */
1571 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001572 );
1573
Walter Dörwald1ab83302007-05-18 17:15:44 +00001574/* Concat two strings and put the result in *pleft
1575 (sets *pleft to NULL on error) */
1576
1577PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001578 PyObject **pleft, /* Pointer to left string */
1579 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001580 );
1581
1582/* Concat two strings, put the result in *pleft and drop the right object
1583 (sets *pleft to NULL on error) */
1584
1585PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001586 PyObject **pleft, /* Pointer to left string */
1587 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001588 );
1589
Guido van Rossumd8225182000-03-10 22:33:05 +00001590/* Split a string giving a list of Unicode strings.
1591
1592 If sep is NULL, splitting will be done at all whitespace
1593 substrings. Otherwise, splits occur at the given separator.
1594
1595 At most maxsplit splits will be done. If negative, no limit is set.
1596
1597 Separators are not included in the resulting list.
1598
1599*/
1600
Mark Hammond91a681d2002-08-12 07:21:58 +00001601PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001602 PyObject *s, /* String to split */
1603 PyObject *sep, /* String separator */
1604 Py_ssize_t maxsplit /* Maxsplit count */
1605 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001606
1607/* Ditto, but split at line breaks.
1608
1609 CRLF is considered to be one line break. Line breaks are not
1610 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001611
Mark Hammond91a681d2002-08-12 07:21:58 +00001612PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 PyObject *s, /* String to split */
1614 int keepends /* If true, line end markers are included */
1615 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001616
Thomas Wouters477c8d52006-05-27 19:21:47 +00001617/* Partition a string using a given separator. */
1618
1619PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620 PyObject *s, /* String to partition */
1621 PyObject *sep /* String separator */
1622 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001623
1624/* Partition a string using a given separator, searching from the end of the
1625 string. */
1626
1627PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 PyObject *s, /* String to partition */
1629 PyObject *sep /* String separator */
1630 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001631
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001632/* Split a string giving a list of Unicode strings.
1633
1634 If sep is NULL, splitting will be done at all whitespace
1635 substrings. Otherwise, splits occur at the given separator.
1636
1637 At most maxsplit splits will be done; if maxsplit is negative, no
1638 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1639 the end of the string.
1640
1641 Separators are not included in the resulting list.
1642
1643*/
1644
1645PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001646 PyObject *s, /* String to split */
1647 PyObject *sep, /* String separator */
1648 Py_ssize_t maxsplit /* Maxsplit count */
1649 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001650
Guido van Rossumd8225182000-03-10 22:33:05 +00001651/* Translate a string by applying a character mapping table to it and
1652 return the resulting Unicode object.
1653
1654 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001656
1657 Mapping tables may be dictionaries or sequences. Unmapped character
1658 ordinals (ones which cause a LookupError) are left untouched and
1659 are copied as-is.
1660
1661*/
1662
Mark Hammond91a681d2002-08-12 07:21:58 +00001663PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001664 PyObject *str, /* String */
1665 PyObject *table, /* Translate table */
1666 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001667 );
1668
1669/* Join a sequence of strings using the given separator and return
1670 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671
Mark Hammond91a681d2002-08-12 07:21:58 +00001672PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001673 PyObject *separator, /* Separator string */
1674 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001675 );
1676
1677/* Return 1 if substr matches str[start:end] at the given tail end, 0
1678 otherwise. */
1679
Martin v. Löwis18e16552006-02-15 17:27:45 +00001680PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001681 PyObject *str, /* String */
1682 PyObject *substr, /* Prefix or Suffix string */
1683 Py_ssize_t start, /* Start index */
1684 Py_ssize_t end, /* Stop index */
1685 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001686 );
1687
1688/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001689 given search direction or -1 if not found. -2 is returned in case
1690 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001691
Martin v. Löwis18e16552006-02-15 17:27:45 +00001692PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 PyObject *str, /* String */
1694 PyObject *substr, /* Substring to find */
1695 Py_ssize_t start, /* Start index */
1696 Py_ssize_t end, /* Stop index */
1697 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001698 );
1699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700/* Like PyUnicode_Find, but search for single character only. */
1701PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1702 PyObject *str, /* String */
1703 Py_UCS4 ch, /* Character to find */
1704 Py_ssize_t start, /* Start index */
1705 Py_ssize_t end, /* Stop index */
1706 int direction /* Find direction, as for PyUnicode_Find */
1707 );
1708
Barry Warsaw51ac5802000-03-20 16:36:48 +00001709/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001710
Martin v. Löwis18e16552006-02-15 17:27:45 +00001711PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001712 PyObject *str, /* String */
1713 PyObject *substr, /* Substring to count */
1714 Py_ssize_t start, /* Start index */
1715 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001716 );
1717
Barry Warsaw51ac5802000-03-20 16:36:48 +00001718/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001719 and return the resulting Unicode object. */
1720
Mark Hammond91a681d2002-08-12 07:21:58 +00001721PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 PyObject *str, /* String */
1723 PyObject *substr, /* Substring to find */
1724 PyObject *replstr, /* Replacement string substituted for substr */
1725 Py_ssize_t maxcount /* Max. number of replacements to apply;
1726 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001727 );
1728
1729/* Compare two strings and return -1, 0, 1 for less than, equal,
1730 greater than resp. */
1731
Mark Hammond91a681d2002-08-12 07:21:58 +00001732PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001733 PyObject *left, /* Left string */
1734 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001735 );
1736
Martin v. Löwis5b222132007-06-10 09:51:05 +00001737PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1738 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001739 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001740 );
1741
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001742/* Rich compare two strings and return one of the following:
1743
1744 - NULL in case an exception was raised
1745 - Py_True or Py_False for successful comparisons
1746 - Py_NotImplemented in case the type combination is unknown
1747
1748 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1749 case the conversion of the arguments to Unicode fails with a
1750 UnicodeDecodeError.
1751
1752 Possible values for op:
1753
1754 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1755
1756*/
1757
1758PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001759 PyObject *left, /* Left string */
1760 PyObject *right, /* Right string */
1761 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001762 );
1763
Thomas Wouters7e474022000-07-16 12:04:32 +00001764/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001765 the resulting Unicode string. */
1766
Mark Hammond91a681d2002-08-12 07:21:58 +00001767PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001768 PyObject *format, /* Format string */
1769 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001770 );
1771
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001772/* Checks whether element is contained in container and return 1/0
1773 accordingly.
1774
1775 element has to coerce to a one-element Unicode string. -1 is
1776 returned in case of an error. */
1777
Mark Hammond91a681d2002-08-12 07:21:58 +00001778PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 PyObject *container, /* Container string */
1780 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001781 );
1782
Martin v. Löwis47383402007-08-15 07:32:56 +00001783/* Checks whether argument is a valid identifier. */
1784
1785PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1786
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001787#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001788/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001789PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001790 PyUnicodeObject *self,
1791 int striptype,
1792 PyObject *sepobj
1793 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001794#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001795
Eric Smith5807c412008-05-11 21:00:57 +00001796/* Using the current locale, insert the thousands grouping
1797 into the string pointed to by buffer. For the argument descriptions,
1798 see Objects/stringlib/localeutil.h */
1799
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001800#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001801PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1802 Py_ssize_t n_buffer,
1803 Py_UNICODE *digits,
1804 Py_ssize_t n_digits,
1805 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001806#endif
Eric Smith5807c412008-05-11 21:00:57 +00001807
Eric Smitha3b1ac82009-04-03 14:45:06 +00001808/* Using explicit passed-in values, insert the thousands grouping
1809 into the string pointed to by buffer. For the argument descriptions,
1810 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001811#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1813 int kind,
1814 void *buffer,
1815 Py_ssize_t n_buffer,
1816 void *digits,
1817 Py_ssize_t n_digits,
1818 Py_ssize_t min_width,
1819 const char *grouping,
1820 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001821#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001822/* === Characters Type APIs =============================================== */
1823
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001824/* Helper array used by Py_UNICODE_ISSPACE(). */
1825
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001826#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001827PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1828
Guido van Rossumd8225182000-03-10 22:33:05 +00001829/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001830 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001831
1832 These APIs are implemented in Objects/unicodectype.c.
1833
1834*/
1835
/* Character classification predicates: lowercase, uppercase, titlecase,
   XID start, XID continue, whitespace and linebreak.  Each takes a single
   character as a Py_UCS4 and returns an int.  None of them is declared when
   Py_LIMITED_API is defined.
   NOTE(review): presumably a nonzero result means the character has the
   named property and zero means it does not -- confirm in
   Objects/unicodectype.c.
   NOTE(review): _PyUnicode_IsWhitespace and _PyUnicode_IsLinebreak qualify
   the by-value parameter with const while the other predicates do not; the
   qualifier has no effect on callers, so this is only a spelling
   inconsistency. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001836PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001837 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001838 );
1839
Mark Hammond91a681d2002-08-12 07:21:58 +00001840PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001841 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001842 );
1843
Mark Hammond91a681d2002-08-12 07:21:58 +00001844PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001845 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001846 );
1847
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001848PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001849 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001850 );
1851
1852PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001853 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001854 );
1855
Mark Hammond91a681d2002-08-12 07:21:58 +00001856PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001857 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001858 );
1859
Mark Hammond91a681d2002-08-12 07:21:58 +00001860PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001861 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001862 );
1863
/* Case conversion helpers.  Each takes a single character as a Py_UCS4 and
   returns a single Py_UCS4.  Not declared when Py_LIMITED_API is defined.
   NOTE(review): because the result is one Py_UCS4, a mapping that expands to
   several characters cannot be expressed through this signature; what is
   returned for such characters, and for characters without a case mapping,
   is not visible in this header -- confirm in Objects/unicodectype.c. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001864PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1865 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001866 );
1867
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001868PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1869 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001870 );
1871
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001872PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1873 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001874 );
1875
/* Numeric value helpers.  Each takes a single character as a Py_UCS4.
   _PyUnicode_ToDecimalDigit and _PyUnicode_ToDigit return an int, while
   _PyUnicode_ToNumeric returns a double.  Not declared when Py_LIMITED_API
   is defined.
   NOTE(review): the value returned for a character that has no decimal,
   digit or numeric value is not visible in this header -- confirm in
   Objects/unicodectype.c before testing results against a sentinel. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001876PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001877 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001878 );
1879
Mark Hammond91a681d2002-08-12 07:21:58 +00001880PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001881 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001882 );
1883
Mark Hammond91a681d2002-08-12 07:21:58 +00001884PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001885 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001886 );
1887
/* Further character classification predicates: decimal digit, digit,
   numeric, printable and alphabetic.  Each takes a single character as a
   Py_UCS4 and returns an int.  Not declared when Py_LIMITED_API is defined.
   NOTE(review): presumably a nonzero result means the character has the
   named property and zero means it does not -- confirm in
   Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001888PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001889 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001890 );
1891
Mark Hammond91a681d2002-08-12 07:21:58 +00001892PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001893 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001894 );
1895
Mark Hammond91a681d2002-08-12 07:21:58 +00001896PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001897 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001898 );
1899
Georg Brandl559e5d72008-06-11 18:37:52 +00001900PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001901 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001902 );
1903
Mark Hammond91a681d2002-08-12 07:21:58 +00001904PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001905 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001906 );
1907
/* String helpers operating on Py_UNICODE buffers.  Not declared when
   Py_LIMITED_API is defined.
   NOTE(review): judging by names and signatures these mirror the C string.h
   functions with the same suffix (strlen, strcpy, strcat, strncpy, strcmp,
   strncmp, strchr, strrchr).  Details such as termination, padding and the
   sign convention of comparison results are not visible in this header --
   confirm against the implementation. */

/* Takes a read-only buffer and returns a size_t. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001908PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1909 const Py_UNICODE *u
1910 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001911
/* s1 is the writable buffer and s2 is read-only.  No destination size is
   passed, so the caller is presumably responsible for making s1 large
   enough -- TODO confirm. */
1912PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001913 Py_UNICODE *s1,
1914 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001915
/* s1 is the writable buffer and s2 is read-only.  No destination size is
   passed. */
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001916PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1917 Py_UNICODE *s1, const Py_UNICODE *s2);
1918
/* As Py_UNICODE_strcpy but with an extra size_t limit n.
   NOTE(review): n is presumably a count of Py_UNICODE elements rather than
   bytes, and whether the result is always terminated is not visible here --
   confirm. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001919PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001920 Py_UNICODE *s1,
1921 const Py_UNICODE *s2,
1922 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001923
/* Compares two read-only buffers and returns an int. */
1924PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001925 const Py_UNICODE *s1,
1926 const Py_UNICODE *s2
1927 );
1928
/* Compares two read-only buffers with an extra size_t limit n and returns
   an int. */
1929PyAPI_FUNC(int) Py_UNICODE_strncmp(
1930 const Py_UNICODE *s1,
1931 const Py_UNICODE *s2,
1932 size_t n
1933 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001934
/* Takes a read-only buffer s and a character c.  The result is a non-const
   Py_UNICODE pointer even though s is const-qualified.
   NOTE(review): presumably NULL when c is not found -- confirm. */
1935PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001936 const Py_UNICODE *s,
1937 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001938 );
1939
/* Same signature as Py_UNICODE_strchr: read-only s, character c, non-const
   result pointer. */
Victor Stinner331ea922010-08-10 16:37:20 +00001940PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001941 const Py_UNICODE *s,
1942 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001943 );
1944
/* String helpers operating on Py_UCS4 buffers.  Not declared when
   Py_LIMITED_API is defined.
   NOTE(review): judging by names and signatures these mirror the C string.h
   functions with the same suffix (strlen, strcpy, strcat, strncpy, strcmp,
   strncmp, strchr, strrchr).  Details such as termination, padding and the
   sign convention of comparison results are not visible in this header --
   confirm against the implementation. */

/* Takes a read-only buffer and returns a size_t. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945PyAPI_FUNC(size_t) Py_UCS4_strlen(
1946 const Py_UCS4 *u
1947 );
1948
/* s1 is the writable buffer and s2 is read-only.  No destination size is
   passed, so the caller is presumably responsible for making s1 large
   enough -- TODO confirm. */
1949PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1950 Py_UCS4 *s1,
1951 const Py_UCS4 *s2);
1952
/* s1 is the writable buffer and s2 is read-only.  No destination size is
   passed. */
1953PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1954 Py_UCS4 *s1, const Py_UCS4 *s2);
1955
/* As Py_UCS4_strcpy but with an extra size_t limit n.
   NOTE(review): n is presumably a count of Py_UCS4 elements rather than
   bytes, and whether the result is always terminated is not visible here --
   confirm. */
1956PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1957 Py_UCS4 *s1,
1958 const Py_UCS4 *s2,
1959 size_t n);
1960
/* Compares two read-only buffers and returns an int. */
1961PyAPI_FUNC(int) Py_UCS4_strcmp(
1962 const Py_UCS4 *s1,
1963 const Py_UCS4 *s2
1964 );
1965
/* Compares two read-only buffers with an extra size_t limit n and returns
   an int. */
1966PyAPI_FUNC(int) Py_UCS4_strncmp(
1967 const Py_UCS4 *s1,
1968 const Py_UCS4 *s2,
1969 size_t n
1970 );
1971
/* Takes a read-only buffer s and a character c.  The result is a non-const
   Py_UCS4 pointer even though s is const-qualified.
   NOTE(review): presumably NULL when c is not found -- confirm. */
1972PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1973 const Py_UCS4 *s,
1974 Py_UCS4 c
1975 );
1976
/* Same signature as Py_UCS4_strchr: read-only s, character c, non-const
   result pointer. */
1977PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1978 const Py_UCS4 *s,
1979 Py_UCS4 c
1980 );
1981
Victor Stinner71133ff2010-09-01 23:43:53 +00001982/* Create a copy of a unicode string ending with a NUL character. Return NULL
1983 and raise a MemoryError exception on memory allocation failure; otherwise
1984 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1985
/* Takes a PyObject pointer and returns a Py_UNICODE pointer.  Not declared
   when Py_LIMITED_API is defined.
   NOTE(review): behaviour when the argument is not a unicode object is not
   visible in this header -- confirm against the implementation. */
Victor Stinner46408602010-09-03 16:18:00 +00001986PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001987 PyObject *unicode
1988 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001989#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001990
Guido van Rossumd8225182000-03-10 22:33:05 +00001991#ifdef __cplusplus
1992}
1993#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001994#endif /* !Py_UNICODEOBJECT_H */