blob: d7c9fa773c7652e21da029c6324cd854bc03c904 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
/* Set these flags if the platform has "wchar.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t.  Neither name is visible under Py_LIMITED_API. */

#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#if defined(MS_WINDOWS)
#  define HAVE_MBCS
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
#  include <wchar.h>
#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test: values below 128 are looked up in the
   _Py_ascii_whitespace table, everything else is passed to
   _PyUnicode_IsWhitespace().  Note that ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* The macros below forward directly to the corresponding _PyUnicode_*
   function. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch satisfies any of the alpha, decimal, digit or numeric tests.
   ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE code units from source to target using
   Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` code units of `target`.
   Every argument is evaluated exactly once: `length` is captured in a
   local before the loop so that an argument with side effects is not
   re-evaluated on each iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* macros to work with surrogates.
   The argument is parenthesized in the expansion so that expressions such
   as `a | b` are classified by their value; it is evaluated twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of `high` and of `low` are combined and 0x10000 is
   added. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
    (((((Py_UCS4)(high) & 0x03FF) << 10) | \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit of the wstr buffers are compared
   before memcmp() is run over the whole substring.  Both `string` and
   `substring` must be pointers to objects exposing `wstr` and
   `wstr_length` members; the arguments are evaluated several times. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == \
       *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, \
             (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner910337b2011-10-03 03:20:16 +0200209 /* Unicode strings can be in 4 states:
210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
218 * utf8 = data
219
220 - compact:
221
222 * structure = PyCompactUnicodeObject
223 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
224 PyUnicode_4BYTE_KIND
225 * compact = 1
226 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200227 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200228
229 - string created by the legacy API (not ready):
230
231 * structure = PyUnicodeObject
232 * kind = PyUnicode_WCHAR_KIND
233 * compact = 0
234 * ready = 0
235 * wstr is not NULL
236 * data.any is NULL
237 * utf8 is NULL
238 * interned = SSTATE_NOT_INTERNED
Victor Stinnera3b334d2011-10-03 13:53:37 +0200239 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200240
241 - string created by the legacy API, ready:
242
243 * structure = PyUnicodeObject structure
244 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
245 PyUnicode_4BYTE_KIND
246 * compact = 0
247 * ready = 1
248 * data.any is not NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200249
250 String created by the legacy API becomes ready when calling
251 PyUnicode_READY().
252
253 See also _PyUnicode_CheckConsistency(). */
Guido van Rossumd8225182000-03-10 22:33:05 +0000254 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200255 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000256 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200257 struct {
258 /*
259 SSTATE_NOT_INTERNED (0)
260 SSTATE_INTERNED_MORTAL (1)
261 SSTATE_INTERNED_IMMORTAL (2)
262
263 If interned != SSTATE_NOT_INTERNED, the two references from the
264 dictionary to this object are *not* counted in ob_refcnt.
265 */
266 unsigned int interned:2;
267 /* Character size:
268
269 PyUnicode_WCHAR_KIND (0): wchar_t*
270 PyUnicode_1BYTE_KIND (1): Py_UCS1*
271 PyUnicode_2BYTE_KIND (2): Py_UCS2*
272 PyUnicode_4BYTE_KIND (3): Py_UCS4*
273 */
274 unsigned int kind:2;
275 /* Compact is with respect to the allocation scheme. Compact unicode
276 objects only require one memory block while non-compact objects use
277 one block for the PyUnicodeObject struct and another for its data
278 buffer. */
279 unsigned int compact:1;
Victor Stinnera3b334d2011-10-03 13:53:37 +0200280 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
281 characters. If ascii is 1 and compact is 1, use the PyASCIIObject
282 structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200283 unsigned int ascii:1;
284 /* The ready flag indicates whether the object layout is initialized
285 completely. This means that this is either a compact object, or
286 the data pointer is filled out. The bit is redundant, and helps
287 to minimize the test in PyUnicode_IS_READY(). */
288 unsigned int ready:1;
289 } state;
290 wchar_t *wstr; /* wchar_t representation (null-terminated) */
291} PyASCIIObject;
292
293/* Non-ASCII strings allocated through PyUnicode_New use the
294 PyCompactUnicodeOject structure. state.compact is set, and the data
295 immediately follow the structure. */
296typedef struct {
297 PyASCIIObject _base;
298 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
299 * terminating \0. */
300 char *utf8; /* UTF-8 representation (null-terminated) */
301 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
302 * surrogates count as two code points. */
303} PyCompactUnicodeObject;
304
305/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
306 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200307 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200308typedef struct {
309 PyCompactUnicodeObject _base;
310 union {
311 void *any;
312 Py_UCS1 *latin1;
313 Py_UCS2 *ucs2;
314 Py_UCS4 *ucs4;
315 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000316} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000317#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000318
Mark Hammond91a681d2002-08-12 07:21:58 +0000319PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000320PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000321
Thomas Wouters27d517b2007-02-25 20:39:11 +0000322#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000323 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
324#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000325
326/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000327#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200328
/* Length of the wstr representation: the `length` field for compact ASCII
   strings, the `wstr_length` field otherwise.  No type checks are
   performed.  `op` is parenthesized inside the casts so that pointer
   expressions (e.g. `p + 1`) are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)

/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? \
      PyUnicode_WSTR_LENGTH(op) : \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation in bytes. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
348
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   If the wstr pointer is already set it is returned directly; otherwise
   PyUnicode_AsUnicode() is called. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* Same pointer as PyUnicode_AS_UNICODE(), cast to const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
361
362
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */

/* Values for PyUnicodeObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
371
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  `op` is parenthesized inside the cast so
   that pointer expressions are cast as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200387
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet. */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro: */

#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3
398
399
/* Return the number of bytes the string uses to represent single characters,
   this can be 1, 2 or 4.  Computed as 1 << (kind - 1).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op) \
    (1 << (PyUnicode_KIND(op) - 1))

/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
415
/* Return one of the PyUnicode_*_KIND values defined above.
   Asserts that op is a unicode object and that it is ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->state.kind)

/* Return a void pointer to the raw unicode buffer of a compact string:
   the address immediately after the PyASCIIObject (compact ASCII) or the
   PyCompactUnicodeObject (other compact strings) structure. */
#define _PyUnicode_COMPACT_DATA(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((void*)((PyASCIIObject*)(op) + 1)) : \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Return the data.any pointer of a non-compact string; asserts that it
   is not NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op) \
    (assert(((PyUnicodeObject*)(op))->data.any), \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Return a void pointer to the raw unicode buffer, choosing between the
   two helpers above according to PyUnicode_IS_COMPACT(op). */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
     _PyUnicode_NONCOMPACT_DATA(op))
436
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
   The index is a character index, the result is a size in bytes.

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
442
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   The value is cast to the element type selected by kind; any kind other
   than 1BYTE/2BYTE is asserted to be PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
469
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  The result is a Py_UCS4; any kind other
   than 1BYTE/2BYTE is read as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
481
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   Asserts that unicode is a ready unicode object. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     (Py_UCS4) \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200497
/* Returns the length of the unicode string.  The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->length)
505
506
/* Fast check to determine whether an object is ready.  Equivalent to
   PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any.
   `op` is parenthesized inside the cast so that pointer expressions are
   cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)

/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \
    (assert(PyUnicode_Check(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200520
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   Result: 0x7f for compact ASCII strings; for 1-byte strings 0x7f when the
   data pointer equals the utf8 pointer and 0xff otherwise; 0xffff for
   2-byte strings; 0x10ffff for everything else.  op must be ready. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) ? (0x7fU) : \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU) \
       ) : \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
        (0xffffU) : (0x10ffffU) \
       ))))
534
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000535#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000536
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace".  Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000545
546/* === Public API ========================================================= */
547
548/* --- Plain Py_UNICODE --------------------------------------------------- */
549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550/* With PEP 393, this is the recommended way to allocate a new unicode object.
551 This function will allocate the object and its buffer in a single memory
552 block. Objects created using this function are not resizable. */
553#ifndef Py_LIMITED_API
554PyAPI_FUNC(PyObject*) PyUnicode_New(
555 Py_ssize_t size, /* Number of code points in the new string */
556 Py_UCS4 maxchar /* maximum code point value in the string */
557 );
558#endif
559
Victor Stinnerd8f65102011-09-29 19:43:17 +0200560/* Initializes the canonical string representation from a the deprecated
561 wstr/Py_UNICODE representation. This function is used to convert Unicode
562 objects which were created using the old API to the new flexible format
563 introduced with PEP 393.
564
565 Don't call this function directly, use the public PyUnicode_READY() macro
566 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200567#ifndef Py_LIMITED_API
568PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200569 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200570 );
571#endif
572
Victor Stinner034f6cf2011-09-30 02:26:44 +0200573/* Get a copy of a Unicode string. */
574PyAPI_FUNC(PyObject*) PyUnicode_Copy(
575 PyObject *unicode
576 );
577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200578/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200579 character conversion when necessary and falls back to memcpy if possible.
580
Victor Stinnera0702ab2011-09-29 14:14:38 +0200581 Fail if to is too small (smaller than how_many or smaller than
582 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
583 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200584
585 Return the number of written character, or return -1 and raise an exception
586 on error.
587
588 Pseudo-code:
589
590 how_many = min(how_many, len(from) - from_start)
591 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
592 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200593
594 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200595 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200596#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200597PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598 PyObject *to,
599 Py_ssize_t to_start,
600 PyObject *from,
601 Py_ssize_t from_start,
602 Py_ssize_t how_many
603 );
604#endif
605
Guido van Rossumd8225182000-03-10 22:33:05 +0000606/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000608
609 u may be NULL which causes the contents to be undefined. It is the
610 user's responsibility to fill in the needed data afterwards. Note
611 that modifying the Unicode object contents after construction is
612 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000613
614 The buffer is copied into the new object. */
615
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000616#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000617PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000618 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000619 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000620 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000621#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000622
Georg Brandl952867a2010-06-27 10:17:12 +0000623/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000624PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000625 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000626 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 );
628
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000629/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000631PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000632 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000633 );
634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635#ifndef Py_LIMITED_API
636PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
637 int kind,
638 const void *buffer,
639 Py_ssize_t size);
640#endif
641
642PyAPI_FUNC(PyObject*) PyUnicode_Substring(
643 PyObject *str,
644 Py_ssize_t start,
645 Py_ssize_t end);
646
647/* Copy the string into a UCS4 buffer, including the null character if copy_null
648 is set. Return NULL and raise an exception on error. Raise a ValueError if
649 the buffer is smaller than the string. Return buffer on success.
650
651 buflen is the length of the buffer in (Py_UCS4) characters. */
652PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
653 PyObject *unicode,
654 Py_UCS4* buffer,
655 Py_ssize_t buflen,
656 int copy_null);
657
658/* Copy the string into a UCS4 buffer. A new buffer is allocated using
659 * PyMem_Malloc; if this fails, NULL is returned with a memory error
660 exception set. */
661PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
662
Guido van Rossumd8225182000-03-10 22:33:05 +0000663/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664 Py_UNICODE buffer.
665 If the wchar_t/Py_UNICODE representation is not yet available, this
666 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000667
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000668#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000669PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000671 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000672#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200674/* Return a read-only pointer to the Unicode object's internal
675 Py_UNICODE buffer and save the length at size.
676 If the wchar_t/Py_UNICODE representation is not yet available, this
677 function will calculate it. */
678
679#ifndef Py_LIMITED_API
680PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
681 PyObject *unicode, /* Unicode object */
682 Py_ssize_t *size /* location where to save the length */
683 );
684#endif
685
Guido van Rossumd8225182000-03-10 22:33:05 +0000686/* Get the length of the Unicode object. */
687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200688PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
689 PyObject *unicode
690);
691
Victor Stinner157f83f2011-09-28 21:41:31 +0200692/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200693 string representation. */
694
Martin v. Löwis18e16552006-02-15 17:27:45 +0000695PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000697 );
698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200699/* Read a character from the string. */
700
701PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
702 PyObject *unicode,
703 Py_ssize_t index
704 );
705
706/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200707 PyUnicode_New, must not be shared, and must not have been hashed yet.
708
709 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710
711PyAPI_FUNC(int) PyUnicode_WriteChar(
712 PyObject *unicode,
713 Py_ssize_t index,
714 Py_UCS4 character
715 );
716
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000717#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000718/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000719PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000720#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000721
Guido van Rossum52c23592000-04-10 13:41:41 +0000722/* Resize an already allocated Unicode object to the new size length.
723
724 *unicode is modified to point to the new (resized) object and 0
725 returned on success.
726
727 This API may only be called by the function which also called the
728 Unicode constructor. The refcount on the object must be 1. Otherwise,
729 an error is returned.
730
731 Error handling is implemented as follows: an exception is set, -1
732 is returned and *unicode left untouched.
733
734*/
735
Mark Hammond91a681d2002-08-12 07:21:58 +0000736PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 PyObject **unicode, /* Pointer to the Unicode object */
738 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000739 );
740
Guido van Rossumd8225182000-03-10 22:33:05 +0000741/* Coerce obj to a Unicode object and return a reference with
742 *incremented* refcount.
743
744 Coercion is done in the following way:
745
Georg Brandl952867a2010-06-27 10:17:12 +0000746 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000747 under the assumption that they contain data using the UTF-8
748 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000749
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000750 2. All other objects (including Unicode objects) raise an
751 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000752
753 The API returns NULL in case of an error. The caller is responsible
754 for decref'ing the returned objects.
755
756*/
757
Mark Hammond91a681d2002-08-12 07:21:58 +0000758PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000759 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000760 const char *encoding, /* encoding */
761 const char *errors /* error handling */
762 );
763
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000764/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000765 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000767 Unicode objects are passed back as-is (subclasses are converted to
768 true Unicode objects), all other objects are delegated to
769 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000770 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000771
772 The API returns NULL in case of an error. The caller is responsible
773 for decref'ing the returned objects.
774
775*/
776
Mark Hammond91a681d2002-08-12 07:21:58 +0000777PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000779 );
780
Victor Stinner1205f272010-09-11 00:54:47 +0000781PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
782 const char *format, /* ASCII-encoded string */
783 va_list vargs
784 );
785PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
786 const char *format, /* ASCII-encoded string */
787 ...
788 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000790#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000791/* Format the object based on the format_spec, as defined in PEP 3101
792 (Advanced String Formatting). */
793PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200794 PyObject *format_spec,
795 Py_ssize_t start,
796 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000797#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000798
Walter Dörwald16807132007-05-25 13:52:07 +0000799PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
800PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000801PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
802 const char *u /* UTF-8 encoded string */
803 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000804#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000805PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000806#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000807
808/* Use only if you know it's a string: op is cast to PyASCIIObject * unchecked */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809#define PyUnicode_CHECK_INTERNED(op) \
810 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000811
Guido van Rossumd8225182000-03-10 22:33:05 +0000812/* --- wchar_t support for platforms which support it --------------------- */
813
814#ifdef HAVE_WCHAR_H
815
Georg Brandl952867a2010-06-27 10:17:12 +0000816/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000817 size.
818
819 The buffer is copied into the new object. */
820
Mark Hammond91a681d2002-08-12 07:21:58 +0000821PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000822 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000823 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000824 );
825
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000826/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000827 most size wchar_t characters are copied.
828
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000829 Note that the resulting wchar_t string may or may not be
830 0-terminated. It is the responsibility of the caller to make sure
831 that the wchar_t string is 0-terminated in case this is required by
832 the application.
833
834 Returns the number of wchar_t characters copied (excluding a
835 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000836 error. */
837
Martin v. Löwis18e16552006-02-15 17:27:45 +0000838PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000839 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000840 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000841 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000842 );
843
Victor Stinner137c34c2010-09-29 10:25:54 +0000844/* Convert the Unicode object to a wide character string. The output string
845 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200846 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000847
848 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
849 on success. On error, returns NULL, *size is undefined and raises a
850 MemoryError. */
851
852PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000853 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000854 Py_ssize_t *size /* number of characters of the result */
855 );
856
Victor Stinner9f789e72011-10-01 03:57:28 +0200857#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200859#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860
Guido van Rossumd8225182000-03-10 22:33:05 +0000861#endif
862
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000863/* --- Unicode ordinals --------------------------------------------------- */
864
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000865/* Create a Unicode Object from the given Unicode code point ordinal.
866
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000867 The ordinal must be in range(0x10000) on narrow Python builds
868 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
869 raised in case it is not.
870
871*/
872
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000873PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000874
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000875/* --- Free-list management ----------------------------------------------- */
876
877/* Clear the free list used by the Unicode implementation.
878
879 This can be used to release memory used for objects on the free
880 list back to the Python memory allocator.
881
882*/
883
884PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
885
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000886/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000887
888 Many of these APIs take two arguments encoding and errors. These
889 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000890 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000891
Georg Brandl952867a2010-06-27 10:17:12 +0000892 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000893
894 Error handling is set by errors which may also be set to NULL
895 meaning to use the default handling defined for the codec. Default
896 error handling for all builtin codecs is "strict" (ValueErrors are
897 raised).
898
899 The codecs all use a similar interface. Only deviations from the
900 generic ones are documented.
901
902*/
903
Fred Drakecb093fe2000-05-09 19:51:53 +0000904/* --- Manage the default encoding ---------------------------------------- */
905
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000906/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000907 Unicode object unicode and the size of the encoded representation
908 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000909
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000910 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000911
Victor Stinner157f83f2011-09-28 21:41:31 +0200912 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913 and subsequent calls will return the same string. The memory is released
914 when the unicodeobject is deallocated.
915
916 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
917 support the previous internal function with the same behaviour.
918
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000919 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000920 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000921
922 *** If you need to access the Unicode object as UTF-8 bytes string,
923 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000924*/
925
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000926#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000928 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000929 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200930#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000931#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000932
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000933/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000934 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
937 in the unicodeobject.
938
939 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
940 support the previous internal function with the same behaviour.
941
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000942 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000943 extracted from the returned data.
944
945 *** This API is for interpreter INTERNAL USE ONLY and will likely
946 *** be removed or changed for Python 3.1.
947
948 *** If you need to access the Unicode object as UTF-8 bytes string,
949 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000950
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000951*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000952
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000953#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
955#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000956#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000957
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000958/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000959
Mark Hammond91a681d2002-08-12 07:21:58 +0000960PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000961
Guido van Rossumd8225182000-03-10 22:33:05 +0000962/* --- Generic Codecs ----------------------------------------------------- */
963
964/* Create a Unicode object by decoding the encoded string s of the
965 given size. */
966
Mark Hammond91a681d2002-08-12 07:21:58 +0000967PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000968 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000969 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000970 const char *encoding, /* encoding */
971 const char *errors /* error handling */
972 );
973
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000974/* Decode a Unicode object unicode and return the result as Python
975 object. */
976
977PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000978 PyObject *unicode, /* Unicode object */
979 const char *encoding, /* encoding */
980 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000981 );
982
983/* Decode a Unicode object unicode and return the result as Unicode
984 object. */
985
986PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000987 PyObject *unicode, /* Unicode object */
988 const char *encoding, /* encoding */
989 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000990 );
991
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000992/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000993 Python string object. */
994
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000995#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000996PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000997 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000998 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 const char *encoding, /* encoding */
1000 const char *errors /* error handling */
1001 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001002#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001003
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001004/* Encodes a Unicode object and returns the result as Python
1005 object. */
1006
1007PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001008 PyObject *unicode, /* Unicode object */
1009 const char *encoding, /* encoding */
1010 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001011 );
1012
Guido van Rossumd8225182000-03-10 22:33:05 +00001013/* Encodes a Unicode object and returns the result as Python string
1014 object. */
1015
Mark Hammond91a681d2002-08-12 07:21:58 +00001016PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017 PyObject *unicode, /* Unicode object */
1018 const char *encoding, /* encoding */
1019 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001020 );
1021
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001022/* Encodes a Unicode object and returns the result as Unicode
1023 object. */
1024
1025PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001026 PyObject *unicode, /* Unicode object */
1027 const char *encoding, /* encoding */
1028 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001029 );
1030
1031/* Build an encoding map. */
1032
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001033PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1034 PyObject* string /* 256 character map */
1035 );
1036
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037/* --- UTF-7 Codecs ------------------------------------------------------- */
1038
Mark Hammond91a681d2002-08-12 07:21:58 +00001039PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001040 const char *string, /* UTF-7 encoded string */
1041 Py_ssize_t length, /* size of string */
1042 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001043 );
1044
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001045PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 const char *string, /* UTF-7 encoded string */
1047 Py_ssize_t length, /* size of string */
1048 const char *errors, /* error handling */
1049 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001050 );
1051
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001052#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001053PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001054 const Py_UNICODE *data, /* Unicode char buffer */
1055 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1056 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1057 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1058 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001059 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001060#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061
Guido van Rossumd8225182000-03-10 22:33:05 +00001062/* --- UTF-8 Codecs ------------------------------------------------------- */
1063
Mark Hammond91a681d2002-08-12 07:21:58 +00001064PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 const char *string, /* UTF-8 encoded string */
1066 Py_ssize_t length, /* size of string */
1067 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001068 );
1069
Walter Dörwald69652032004-09-07 20:24:22 +00001070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 const char *string, /* UTF-8 encoded string */
1072 Py_ssize_t length, /* size of string */
1073 const char *errors, /* error handling */
1074 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001075 );
1076
Mark Hammond91a681d2002-08-12 07:21:58 +00001077PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001079 );
1080
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001081#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1083 PyObject *unicode,
1084 const char *errors);
1085
Mark Hammond91a681d2002-08-12 07:21:58 +00001086PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 const Py_UNICODE *data, /* Unicode char buffer */
1088 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1089 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001090 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001091#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001092
Walter Dörwald41980ca2007-08-16 21:55:45 +00001093/* --- UTF-32 Codecs ------------------------------------------------------ */
1094
1095/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1096 the corresponding Unicode object.
1097
1098 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001100
1101 If byteorder is non-NULL, the decoder starts decoding using the
1102 given byte order:
1103
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001104 *byteorder == -1: little endian
1105 *byteorder == 0: native order
1106 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001107
1108 In native mode, the first four bytes of the stream are checked for a
1109 BOM mark. If found, the BOM mark is analysed, the byte order
1110 adjusted and the BOM skipped. In the other modes, no BOM mark
1111 interpretation is done. After completion, *byteorder is set to the
1112 current byte order at the end of input data.
1113
1114 If byteorder is NULL, the codec starts in native order mode.
1115
1116*/
1117
1118PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 const char *string, /* UTF-32 encoded string */
1120 Py_ssize_t length, /* size of string */
1121 const char *errors, /* error handling */
1122 int *byteorder /* pointer to byteorder to use
1123 0=native;-1=LE,1=BE; updated on
1124 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001125 );
1126
1127PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 const char *string, /* UTF-32 encoded string */
1129 Py_ssize_t length, /* size of string */
1130 const char *errors, /* error handling */
1131 int *byteorder, /* pointer to byteorder to use
1132 0=native;-1=LE,1=BE; updated on
1133 exit */
1134 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001135 );
1136
1137/* Returns a Python string using the UTF-32 encoding in native byte
1138 order. The string always starts with a BOM mark. */
1139
1140PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001142 );
1143
1144/* Returns a Python string object holding the UTF-32 encoded value of
1145 the Unicode data.
1146
1147 If byteorder is not 0, output is written according to the following
1148 byte order:
1149
1150 byteorder == -1: little endian
1151 byteorder == 0: native byte order (writes a BOM mark)
1152 byteorder == 1: big endian
1153
1154 If byteorder is 0, the output string will always start with the
1155 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1156 prepended.
1157
1158*/
1159
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001160#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001161PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 const Py_UNICODE *data, /* Unicode char buffer */
1163 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1164 const char *errors, /* error handling */
1165 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001166 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001167#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001168
Guido van Rossumd8225182000-03-10 22:33:05 +00001169/* --- UTF-16 Codecs ------------------------------------------------------ */
1170
Guido van Rossum9e896b32000-04-05 20:11:21 +00001171/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001172 the corresponding Unicode object.
1173
1174 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001175 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001176
1177 If byteorder is non-NULL, the decoder starts decoding using the
1178 given byte order:
1179
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 *byteorder == -1: little endian
1181 *byteorder == 0: native order
1182 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001183
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001184 In native mode, the first two bytes of the stream are checked for a
1185 BOM mark. If found, the BOM mark is analysed, the byte order
1186 adjusted and the BOM skipped. In the other modes, no BOM mark
1187 interpretation is done. After completion, *byteorder is set to the
1188 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001189
1190 If byteorder is NULL, the codec starts in native order mode.
1191
1192*/
1193
Mark Hammond91a681d2002-08-12 07:21:58 +00001194PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195 const char *string, /* UTF-16 encoded string */
1196 Py_ssize_t length, /* size of string */
1197 const char *errors, /* error handling */
1198 int *byteorder /* pointer to byteorder to use
1199 0=native;-1=LE,1=BE; updated on
1200 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001201 );
1202
Walter Dörwald69652032004-09-07 20:24:22 +00001203PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 const char *string, /* UTF-16 encoded string */
1205 Py_ssize_t length, /* size of string */
1206 const char *errors, /* error handling */
1207 int *byteorder, /* pointer to byteorder to use
1208 0=native;-1=LE,1=BE; updated on
1209 exit */
1210 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001211 );
1212
Guido van Rossumd8225182000-03-10 22:33:05 +00001213/* Returns a Python string using the UTF-16 encoding in native byte
1214 order. The string always starts with a BOM mark. */
1215
Mark Hammond91a681d2002-08-12 07:21:58 +00001216PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001218 );
1219
1220/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001221 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001222
1223 If byteorder is not 0, output is written according to the following
1224 byte order:
1225
1226 byteorder == -1: little endian
1227 byteorder == 0: native byte order (writes a BOM mark)
1228 byteorder == 1: big endian
1229
1230 If byteorder is 0, the output string will always start with the
1231 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1232 prepended.
1233
1234 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1235 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001236 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001237
1238*/
1239
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001240#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001241PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242 const Py_UNICODE *data, /* Unicode char buffer */
1243 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1244 const char *errors, /* error handling */
1245 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001246 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001247#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001248
1249/* --- Unicode-Escape Codecs ---------------------------------------------- */
1250
Mark Hammond91a681d2002-08-12 07:21:58 +00001251PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001252 const char *string, /* Unicode-Escape encoded string */
1253 Py_ssize_t length, /* size of string */
1254 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001255 );
1256
Mark Hammond91a681d2002-08-12 07:21:58 +00001257PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001259 );
1260
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001261#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001262PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001263 const Py_UNICODE *data, /* Unicode char buffer */
1264 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001265 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001266#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001267
1268/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1269
Mark Hammond91a681d2002-08-12 07:21:58 +00001270PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 const char *string, /* Raw-Unicode-Escape encoded string */
1272 Py_ssize_t length, /* size of string */
1273 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001274 );
1275
Mark Hammond91a681d2002-08-12 07:21:58 +00001276PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001278 );
1279
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001280#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001281PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 const Py_UNICODE *data, /* Unicode char buffer */
1283 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001284 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001286
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001287/* --- Unicode Internal Codec ---------------------------------------------
1288
1289 Only for internal use in _codecsmodule.c */
1290
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001291#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001292PyObject *_PyUnicode_DecodeUnicodeInternal(
1293 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001294 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001295 const char *errors
1296 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001298
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001300
1301 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1302
1303*/
1304
Mark Hammond91a681d2002-08-12 07:21:58 +00001305PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 const char *string, /* Latin-1 encoded string */
1307 Py_ssize_t length, /* size of string */
1308 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 );
1310
Mark Hammond91a681d2002-08-12 07:21:58 +00001311PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001313 );
1314
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001315#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1317 PyObject* unicode,
1318 const char* errors);
1319
Mark Hammond91a681d2002-08-12 07:21:58 +00001320PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 const Py_UNICODE *data, /* Unicode char buffer */
1322 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1323 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001324 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001325#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001326
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001327/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001328
1329 Only 7-bit ASCII data is accepted. All other codes generate errors.
1330
1331*/
1332
Mark Hammond91a681d2002-08-12 07:21:58 +00001333PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 const char *string, /* ASCII encoded string */
1335 Py_ssize_t length, /* size of string */
1336 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001337 );
1338
Mark Hammond91a681d2002-08-12 07:21:58 +00001339PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001340 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001341 );
1342
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001343#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1345 PyObject* unicode,
1346 const char* errors);
1347
Mark Hammond91a681d2002-08-12 07:21:58 +00001348PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 const Py_UNICODE *data, /* Unicode char buffer */
1350 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1351 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001352 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001353#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001354
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001355/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001356
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001358
1359 Decoding mappings must map single string characters to single
1360 Unicode characters, integers (which are then interpreted as Unicode
1361 ordinals) or None (meaning "undefined mapping" and causing an
1362 error).
1363
1364 Encoding mappings must map single Unicode characters to single
1365 string characters, integers (which are then interpreted as Latin-1
1366 ordinals) or None (meaning "undefined mapping" and causing an
1367 error).
1368
1369 If a character lookup fails with a LookupError, the character is
1370 copied as-is meaning that its ordinal value will be interpreted as
1371 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1372 to contain those mappings which map characters to different code
1373 points.
1374
1375*/
1376
Mark Hammond91a681d2002-08-12 07:21:58 +00001377PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 const char *string, /* Encoded string */
1379 Py_ssize_t length, /* size of string */
1380 PyObject *mapping, /* character mapping
1381 (char ordinal -> unicode ordinal) */
1382 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001383 );
1384
Mark Hammond91a681d2002-08-12 07:21:58 +00001385PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386 PyObject *unicode, /* Unicode object */
1387 PyObject *mapping /* character mapping
1388 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001389 );
1390
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001391#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001392PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 const Py_UNICODE *data, /* Unicode char buffer */
1394 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1395 PyObject *mapping, /* character mapping
1396 (unicode ordinal -> char ordinal) */
1397 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001398 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001399#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001400
1401/* Translate a Py_UNICODE buffer of the given length by applying a
1402 character mapping table to it and return the resulting Unicode
1403 object.
1404
1405 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001406 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001407
1408 Mapping tables may be dictionaries or sequences. Unmapped character
1409 ordinals (ones which cause a LookupError) are left untouched and
1410 are copied as-is.
1411
1412*/
1413
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001414#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001415PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 const Py_UNICODE *data, /* Unicode char buffer */
1417 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1418 PyObject *table, /* Translation table */
1419 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001420 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001421#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001422
Victor Stinner99b95382011-07-04 14:23:54 +02001423#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001424
Guido van Rossumefec1152000-03-28 02:01:15 +00001425/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001426
Mark Hammond91a681d2002-08-12 07:21:58 +00001427PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001428 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001429 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001430 const char *errors /* error handling */
1431 );
1432
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001433PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1434 const char *string, /* MBCS encoded string */
1435 Py_ssize_t length, /* size of string */
1436 const char *errors, /* error handling */
1437 Py_ssize_t *consumed /* bytes consumed */
1438 );
1439
Mark Hammond91a681d2002-08-12 07:21:58 +00001440PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001441 PyObject *unicode /* Unicode object */
1442 );
1443
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001444#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001445PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001446 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001447 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001448 const char *errors /* error handling */
1449 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001450#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001451
Victor Stinner99b95382011-07-04 14:23:54 +02001452#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001453
Guido van Rossum9e896b32000-04-05 20:11:21 +00001454/* --- Decimal Encoder ---------------------------------------------------- */
1455
1456/* Takes a Unicode string holding a decimal value and writes it into
1457 an output buffer using standard ASCII digit codes.
1458
1459 The output buffer has to provide at least length+1 bytes of storage
1460 area. The output string is 0-terminated.
1461
1462 The encoder converts whitespace to ' ', decimal characters to their
1463 corresponding ASCII digit and all other Latin-1 characters except
1464 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1465 are treated as errors. This includes embedded NUL characters.
1466
1467 Error handling is defined by the errors argument:
1468
1469 NULL or "strict": raise a ValueError
1470 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001472 "replace": replaces illegal characters with '?'
1473
1474 Returns 0 on success, -1 on failure.
1475
1476*/
1477
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001478#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001479PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 Py_UNICODE *s, /* Unicode buffer */
1481 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1482 char *output, /* Output buffer; must have size >= length + 1
                    (room for the terminating 0) */
1483 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001484 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001485#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001486
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001487/* Transforms code points that have decimal digit property to the
1488 corresponding ASCII digit code points.
1489
1490 Returns a new Unicode string on success, NULL on failure.
1491*/
1492
Georg Brandlb5503082010-12-05 11:40:48 +00001493#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001494PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1495 Py_UNICODE *s, /* Unicode buffer */
1496 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1497 );
Georg Brandlb5503082010-12-05 11:40:48 +00001498#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1501 as argument instead of a raw buffer and length. This function additionally
1502 transforms spaces to ASCII because this is what the callers in longobject,
1503 floatobject, and complexobject did anyways. */
1504
1505#ifndef Py_LIMITED_API
1506PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1507 PyObject *unicode /* Unicode object */
1508 );
1509#endif
1510
Martin v. Löwis011e8422009-05-05 04:43:17 +00001511/* --- File system encoding ---------------------------------------------- */
1512
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001513/* ParseTuple converter: encode str objects to bytes using
1514 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001515
1516PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1517
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001518/* ParseTuple converter: decode bytes objects to unicode using
1519 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1520
1521PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1522
Victor Stinner77c38622010-05-14 15:58:55 +00001523/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1524 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001525
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001526 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1527 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001528
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001529 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001530*/
1531
1532PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1533 const char *s /* encoded string */
1534 );
1535
Victor Stinner77c38622010-05-14 15:58:55 +00001536/* Decode a string using Py_FileSystemDefaultEncoding
1537 and the "surrogateescape" error handler.
1538
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001539 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1540 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001541*/
1542
Martin v. Löwis011e8422009-05-05 04:43:17 +00001543PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1544 const char *s, /* encoded string */
1545 Py_ssize_t size /* size */
1546 );
1547
Victor Stinnerae6265f2010-05-15 16:27:27 +00001548/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001549 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001550
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001551 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1552 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001553*/
1554
1555PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1556 PyObject *unicode
1557 );
1558
Guido van Rossumd8225182000-03-10 22:33:05 +00001559/* --- Methods & Slots ----------------------------------------------------
1560
1561 These are capable of handling Unicode objects and strings on input
1562 (we refer to them as strings in the descriptions) and return
1563 Unicode objects or integers as appropriate. */
1564
1565/* Concat two strings giving a new Unicode string. */
1566
Mark Hammond91a681d2002-08-12 07:21:58 +00001567PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001568 PyObject *left, /* Left string */
1569 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001570 );
1571
Walter Dörwald1ab83302007-05-18 17:15:44 +00001572/* Concat two strings and put the result in *pleft
1573 (sets *pleft to NULL on error) */
1574
1575PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001576 PyObject **pleft, /* Pointer to left string */
1577 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001578 );
1579
1580/* Concat two strings, put the result in *pleft and drop the right object
1581 (sets *pleft to NULL on error) */
1582
1583PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584 PyObject **pleft, /* Pointer to left string */
1585 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001586 );
1587
Guido van Rossumd8225182000-03-10 22:33:05 +00001588/* Split a string giving a list of Unicode strings.
1589
1590 If sep is NULL, splitting will be done at all whitespace
1591 substrings. Otherwise, splits occur at the given separator.
1592
1593 At most maxsplit splits will be done. If negative, no limit is set.
1594
1595 Separators are not included in the resulting list.
1596
1597*/
1598
Mark Hammond91a681d2002-08-12 07:21:58 +00001599PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 PyObject *s, /* String to split */
1601 PyObject *sep, /* String separator */
1602 Py_ssize_t maxsplit /* Maxsplit count */
1603 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001604
1605/* Like PyUnicode_Split(), but split at line breaks.
1606
1607 CRLF is considered to be one line break. Line breaks are not
1608 included in the resulting list. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609
Mark Hammond91a681d2002-08-12 07:21:58 +00001610PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001611 PyObject *s, /* String to split */
1612 int keepends /* If true, line end markers are included */
1613 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001614
Thomas Wouters477c8d52006-05-27 19:21:47 +00001615/* Partition a string using a given separator. */
1616
1617PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001618 PyObject *s, /* String to partition */
1619 PyObject *sep /* String separator */
1620 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001621
1622/* Partition a string using a given separator, searching from the end of the
1623 string. */
1624
1625PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626 PyObject *s, /* String to partition */
1627 PyObject *sep /* String separator */
1628 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001629
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001630/* Split a string giving a list of Unicode strings.
1631
1632 If sep is NULL, splitting will be done at all whitespace
1633 substrings. Otherwise, splits occur at the given separator.
1634
1635 At most maxsplit splits will be done; if maxsplit is negative, no
1636 limit is set. Unlike PyUnicode_Split(), PyUnicode_RSplit() splits
1637 from the end of the string.
1638
1639 Separators are not included in the resulting list.
1640
1641*/
1642
1643PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 PyObject *s, /* String to split */
1645 PyObject *sep, /* String separator */
1646 Py_ssize_t maxsplit /* Maxsplit count */
1647 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001648
Guido van Rossumd8225182000-03-10 22:33:05 +00001649/* Translate a string by applying a character mapping table to it and
1650 return the resulting Unicode object.
1651
1652 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001654
1655 Mapping tables may be dictionaries or sequences. Unmapped character
1656 ordinals (ones which cause a LookupError) are left untouched and
1657 are copied as-is.
1658
1659*/
1660
Mark Hammond91a681d2002-08-12 07:21:58 +00001661PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 PyObject *str, /* String */
1663 PyObject *table, /* Translate table */
1664 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001665 );
1666
1667/* Join a sequence of strings using the given separator and return
1668 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669
Mark Hammond91a681d2002-08-12 07:21:58 +00001670PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 PyObject *separator, /* Separator string */
1672 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001673 );
1674
1675/* Return 1 if substr matches str[start:end] at the given tail end, 0
1676 otherwise. Return -1 if an error occurred. */
1677
Martin v. Löwis18e16552006-02-15 17:27:45 +00001678PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 PyObject *str, /* String */
1680 PyObject *substr, /* Prefix or Suffix string */
1681 Py_ssize_t start, /* Start index */
1682 Py_ssize_t end, /* Stop index */
1683 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001684 );
1685
1686/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001687 given search direction or -1 if not found. -2 is returned in case
1688 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001689
Martin v. Löwis18e16552006-02-15 17:27:45 +00001690PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001691 PyObject *str, /* String */
1692 PyObject *substr, /* Substring to find */
1693 Py_ssize_t start, /* Start index */
1694 Py_ssize_t end, /* Stop index */
1695 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001696 );
1697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698/* Like PyUnicode_Find, but searches for a single character only. */
1699PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1700 PyObject *str, /* String */
1701 Py_UCS4 ch, /* Character to find */
1702 Py_ssize_t start, /* Start index */
1703 Py_ssize_t end, /* Stop index */
1704 int direction /* Find direction; presumably +1 forward, -1 backward
                   as for PyUnicode_Find -- TODO confirm */
1705 );
1706
Barry Warsaw51ac5802000-03-20 16:36:48 +00001707/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001708
Martin v. Löwis18e16552006-02-15 17:27:45 +00001709PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 PyObject *str, /* String */
1711 PyObject *substr, /* Substring to count */
1712 Py_ssize_t start, /* Start index */
1713 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001714 );
1715
Barry Warsaw51ac5802000-03-20 16:36:48 +00001716/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001717 and return the resulting Unicode object. */
1718
Mark Hammond91a681d2002-08-12 07:21:58 +00001719PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 PyObject *str, /* String */
1721 PyObject *substr, /* Substring to find */
1722 PyObject *replstr, /* Substring to replace */
1723 Py_ssize_t maxcount /* Max. number of replacements to apply;
1724 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001725 );
1726
1727/* Compare two strings and return -1, 0, 1 for less than, equal,
1728 greater than resp. */
1729
Mark Hammond91a681d2002-08-12 07:21:58 +00001730PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001731 PyObject *left, /* Left string */
1732 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001733 );
1734
Martin v. Löwis5b222132007-06-10 09:51:05 +00001735PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1736 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001737 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001738 );
1739
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001740/* Rich compare two strings and return one of the following:
1741
1742 - NULL in case an exception was raised
1743 - Py_True or Py_False for successful comparisons
1744 - Py_NotImplemented in case the type combination is unknown
1745
1746 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1747 case the conversion of the arguments to Unicode fails with a
1748 UnicodeDecodeError.
1749
1750 Possible values for op:
1751
1752 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1753
1754*/
1755
1756PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001757 PyObject *left, /* Left string */
1758 PyObject *right, /* Right string */
1759 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001760 );
1761
Thomas Wouters7e474022000-07-16 12:04:32 +00001762/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001763 the resulting Unicode string. */
1764
Mark Hammond91a681d2002-08-12 07:21:58 +00001765PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 PyObject *format, /* Format string */
1767 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001768 );
1769
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001770/* Checks whether element is contained in container and return 1/0
1771 accordingly.
1772
1773 element has to coerce to a one-element Unicode string. -1 is
1774 returned in case of an error. */
1775
Mark Hammond91a681d2002-08-12 07:21:58 +00001776PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 PyObject *container, /* Container string */
1778 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001779 );
1780
Martin v. Löwis47383402007-08-15 07:32:56 +00001781/* Checks whether argument is a valid identifier. */
1782
1783PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1784
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001785#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001786/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001787PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001788 PyUnicodeObject *self,
1789 int striptype,
1790 PyObject *sepobj
1791 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001792#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001793
Eric Smith5807c412008-05-11 21:00:57 +00001794/* Using the current locale, insert the thousands grouping
1795 into the string pointed to by buffer. For the argument descriptions,
1796 see Objects/stringlib/localeutil.h */
1797
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001798#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001799PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1800 Py_ssize_t n_buffer,
1801 Py_UNICODE *digits,
1802 Py_ssize_t n_digits,
1803 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001804#endif
Eric Smith5807c412008-05-11 21:00:57 +00001805
Eric Smitha3b1ac82009-04-03 14:45:06 +00001806/* Using explicit passed-in values, insert the thousands grouping
1807 into the string pointed to by buffer. For the argument descriptions,
1808 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001809#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1811 int kind,
1812 void *buffer,
1813 Py_ssize_t n_buffer,
1814 void *digits,
1815 Py_ssize_t n_digits,
1816 Py_ssize_t min_width,
1817 const char *grouping,
1818 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001819#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001820/* === Characters Type APIs =============================================== */
1821
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001822/* Helper array used by Py_UNICODE_ISSPACE(). */
1823
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001824#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001825PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1826
Guido van Rossumd8225182000-03-10 22:33:05 +00001827/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001828 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001829
1830 These APIs are implemented in Objects/unicodectype.c.
1831
1832*/
1833
Mark Hammond91a681d2002-08-12 07:21:58 +00001834PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001835 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001836 );
1837
Mark Hammond91a681d2002-08-12 07:21:58 +00001838PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001839 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001840 );
1841
Mark Hammond91a681d2002-08-12 07:21:58 +00001842PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001843 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001844 );
1845
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001846PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001847 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001848 );
1849
1850PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001851 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001852 );
1853
Mark Hammond91a681d2002-08-12 07:21:58 +00001854PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001855 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001856 );
1857
Mark Hammond91a681d2002-08-12 07:21:58 +00001858PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001859 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001860 );
1861
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001862PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1863 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001864 );
1865
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001866PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1867 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001868 );
1869
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001870PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1871 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001872 );
1873
Mark Hammond91a681d2002-08-12 07:21:58 +00001874PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001875 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001876 );
1877
Mark Hammond91a681d2002-08-12 07:21:58 +00001878PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001879 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001880 );
1881
Mark Hammond91a681d2002-08-12 07:21:58 +00001882PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001883 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001884 );
1885
Mark Hammond91a681d2002-08-12 07:21:58 +00001886PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001887 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001888 );
1889
Mark Hammond91a681d2002-08-12 07:21:58 +00001890PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001891 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001892 );
1893
Mark Hammond91a681d2002-08-12 07:21:58 +00001894PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001895 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001896 );
1897
Georg Brandl559e5d72008-06-11 18:37:52 +00001898PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001899 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001900 );
1901
Mark Hammond91a681d2002-08-12 07:21:58 +00001902PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001903 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001904 );
1905
Victor Stinneref8d95c2010-08-16 22:03:11 +00001906PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1907 const Py_UNICODE *u
1908 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001909
1910PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001911 Py_UNICODE *s1,
1912 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001913
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001914PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1915 Py_UNICODE *s1, const Py_UNICODE *s2);
1916
Martin v. Löwis5b222132007-06-10 09:51:05 +00001917PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001918 Py_UNICODE *s1,
1919 const Py_UNICODE *s2,
1920 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001921
1922PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001923 const Py_UNICODE *s1,
1924 const Py_UNICODE *s2
1925 );
1926
1927PyAPI_FUNC(int) Py_UNICODE_strncmp(
1928 const Py_UNICODE *s1,
1929 const Py_UNICODE *s2,
1930 size_t n
1931 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001932
1933PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001934 const Py_UNICODE *s,
1935 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001936 );
1937
Victor Stinner331ea922010-08-10 16:37:20 +00001938PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001939 const Py_UNICODE *s,
1940 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001941 );
1942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943PyAPI_FUNC(size_t) Py_UCS4_strlen(
1944 const Py_UCS4 *u
1945 );
1946
1947PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1948 Py_UCS4 *s1,
1949 const Py_UCS4 *s2);
1950
1951PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1952 Py_UCS4 *s1, const Py_UCS4 *s2);
1953
1954PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1955 Py_UCS4 *s1,
1956 const Py_UCS4 *s2,
1957 size_t n);
1958
1959PyAPI_FUNC(int) Py_UCS4_strcmp(
1960 const Py_UCS4 *s1,
1961 const Py_UCS4 *s2
1962 );
1963
1964PyAPI_FUNC(int) Py_UCS4_strncmp(
1965 const Py_UCS4 *s1,
1966 const Py_UCS4 *s2,
1967 size_t n
1968 );
1969
1970PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1971 const Py_UCS4 *s,
1972 Py_UCS4 c
1973 );
1974
1975PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1976 const Py_UCS4 *s,
1977 Py_UCS4 c
1978 );
1979
Victor Stinner71133ff2010-09-01 23:43:53 +00001980/* Create a copy of a unicode string ending with a nul character. Return NULL
1981 and raise a MemoryError exception on memory allocation failure, otherwise
1982 return a new allocated buffer (use PyMem_Free() to free the buffer). */
1983
Victor Stinner46408602010-09-03 16:18:00 +00001984PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001985 PyObject *unicode
1986 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001987#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001988
Guido van Rossumd8225182000-03-10 22:33:05 +00001989#ifdef __cplusplus
1990}
1991#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001992#endif /* !Py_UNICODEOBJECT_H */