blob: 99dcdd8b77393289ec43cf1d25236468aa7917d4 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
   unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are answered from the
   _Py_ascii_whitespace look-up table; everything else is handed to
   _PyUnicode_IsWhitespace().  The argument is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    (((ch) >= 128U) ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
/* Thin aliases that forward a single code point to the corresponding
   _PyUnicode_* helper.  The argument is passed through unchanged. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(c)       _PyUnicode_IsLowercase(c)
#define Py_UNICODE_ISUPPER(c)       _PyUnicode_IsUppercase(c)
#define Py_UNICODE_ISTITLE(c)       _PyUnicode_IsTitlecase(c)
#define Py_UNICODE_ISLINEBREAK(c)   _PyUnicode_IsLinebreak(c)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(c)       _PyUnicode_ToLowercase(c)
#define Py_UNICODE_TOUPPER(c)       _PyUnicode_ToUppercase(c)
#define Py_UNICODE_TOTITLE(c)       _PyUnicode_ToTitlecase(c)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(c)     _PyUnicode_IsDecimalDigit(c)
#define Py_UNICODE_ISDIGIT(c)       _PyUnicode_IsDigit(c)
#define Py_UNICODE_ISNUMERIC(c)     _PyUnicode_IsNumeric(c)
#define Py_UNICODE_ISPRINTABLE(c)   _PyUnicode_IsPrintable(c)

/* Numeric conversions. */
#define Py_UNICODE_TODECIMAL(c)     _PyUnicode_ToDecimalDigit(c)
#define Py_UNICODE_TODIGIT(c)       _PyUnicode_ToDigit(c)
#define Py_UNICODE_TONUMERIC(c)     _PyUnicode_ToNumeric(c)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(c)       _PyUnicode_IsAlpha(c)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
/* True when ch passes any of the alpha, decimal, digit or numeric tests.
   The tests run in that order and stop at the first one that succeeds;
   ch is therefore evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)      \
    (Py_UNICODE_ISALPHA(ch)   ||    \
     Py_UNICODE_ISDECIMAL(ch) ||    \
     Py_UNICODE_ISDIGIT(ch)   ||    \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE elements from `source` to `target` using
   Py_MEMCPY; the byte count is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length)                         \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` elements of `target`.

   Every argument is evaluated exactly once (target, then value, then
   length), so expressions with side effects are safe.  Previously
   `length` was re-evaluated on each loop iteration.  A length of zero
   or less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.

   The predicates test whether ch lies in the surrogate range
   (0xD800..0xDFFF), its high half (0xD800..0xDBFF) or its low half
   (0xDC00..0xDFFF).  The argument is parenthesized in the expansion so
   that compound expressions such as `x & 0xFFFF` are tested as a whole;
   it is evaluated up to twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of `high` and of `low` are combined and offset by
   0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   Both objects are accessed through their PyASCIIObject header, and the
   substring length is obtained with PyUnicode_WSTR_LENGTH().  The earlier
   form dereferenced ->wstr and ->wstr_length on the same pointer, which no
   structure in this header provides as direct members.

   The wstr buffers are read directly and are not computed on demand, so
   they must already be present.  The first and last code units are
   compared before memcmp() is called.  All arguments are evaluated more
   than once. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
209 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200210 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000211 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212 struct {
213 /*
214 SSTATE_NOT_INTERNED (0)
215 SSTATE_INTERNED_MORTAL (1)
216 SSTATE_INTERNED_IMMORTAL (2)
217
218 If interned != SSTATE_NOT_INTERNED, the two references from the
219 dictionary to this object are *not* counted in ob_refcnt.
220 */
221 unsigned int interned:2;
222 /* Character size:
223
224 PyUnicode_WCHAR_KIND (0): wchar_t*
225 PyUnicode_1BYTE_KIND (1): Py_UCS1*
226 PyUnicode_2BYTE_KIND (2): Py_UCS2*
227 PyUnicode_4BYTE_KIND (3): Py_UCS4*
228 */
229 unsigned int kind:2;
230 /* Compact is with respect to the allocation scheme. Compact unicode
231 objects only require one memory block while non-compact objects use
232 one block for the PyUnicodeObject struct and another for its data
233 buffer. */
234 unsigned int compact:1;
235 /* Compact objects which are ASCII-only also have the state.compact
236 flag set, and use the PyASCIIObject struct. */
237 unsigned int ascii:1;
238 /* The ready flag indicates whether the object layout is initialized
239 completely. This means that this is either a compact object, or
240 the data pointer is filled out. The bit is redundant, and helps
241 to minimize the test in PyUnicode_IS_READY(). */
242 unsigned int ready:1;
243 } state;
244 wchar_t *wstr; /* wchar_t representation (null-terminated) */
245} PyASCIIObject;
246
247/* Non-ASCII strings allocated through PyUnicode_New use the
248 PyCompactUnicodeOject structure. state.compact is set, and the data
249 immediately follow the structure. */
250typedef struct {
251 PyASCIIObject _base;
252 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
253 * terminating \0. */
254 char *utf8; /* UTF-8 representation (null-terminated) */
255 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
256 * surrogates count as two code points. */
257} PyCompactUnicodeObject;
258
259/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
260 PyUnicodeObject structure. The actual string data is initially in the wstr
261 block, and copied into the data block using PyUnicode_Ready. */
262typedef struct {
263 PyCompactUnicodeObject _base;
264 union {
265 void *any;
266 Py_UCS1 *latin1;
267 Py_UCS2 *ucs2;
268 Py_UCS4 *ucs4;
269 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000270} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000271#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000272
Mark Hammond91a681d2002-08-12 07:21:58 +0000273PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000274PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000275
Thomas Wouters27d517b2007-02-25 20:39:11 +0000276#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000277 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
278#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000279
280/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000281#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200282
/* Length of the wstr representation of op.  When state.ascii is set the
   value is read from the PyASCIIObject length field; otherwise it is the
   wstr_length field of PyCompactUnicodeObject.

   No type check is performed.  op is parenthesized inside each cast so
   that expression arguments (e.g. `objs + 1`) are cast as a whole; it is
   evaluated twice. */
#define PyUnicode_WSTR_LENGTH(op) \
    (((PyASCIIObject*)(op))->state.ascii ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
287
/* PyUnicode_GET_SIZE: size of the deprecated Py_UNICODE representation in
   code units (a surrogate pair counts as 2 units).  When the wstr pointer
   is not set, PyUnicode_AsUnicode() is called first so that the
   representation is computed on request; its result is discarded.
   Use PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op)                          \
    (assert(PyUnicode_Check(op)),                       \
     (!((PyASCIIObject *)(op))->wstr) ?                 \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),     \
       PyUnicode_WSTR_LENGTH(op)) :                     \
      PyUnicode_WSTR_LENGTH(op))

/* PyUnicode_GET_DATA_SIZE: PyUnicode_GET_SIZE() scaled by
   Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op)                     \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
302
/* PyUnicode_AS_UNICODE: alias for PyUnicode_AsUnicode().  The cached wstr
   pointer is returned when it is set; otherwise PyUnicode_AsUnicode() is
   called to create the wchar_t/Py_UNICODE representation on demand.
   Using this macro is very inefficient now; try to port your code to the
   new PyUnicode_*BYTE_DATA() macros or use PyUnicode_WRITE() and
   PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (!((PyASCIIObject *)(op))->wstr) ?                 \
      PyUnicode_AsUnicode((PyObject *)(op)) :           \
      (((PyASCIIObject *)(op))->wstr))

/* PyUnicode_AS_DATA: the PyUnicode_AS_UNICODE() pointer viewed as
   const char *. */
#define PyUnicode_AS_DATA(op)                           \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
315
316
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
318
319/* Values for PyUnicodeObject.state: */
320
/* Values of the state.interned field. */
#define SSTATE_NOT_INTERNED         0
#define SSTATE_INTERNED_MORTAL      1
#define SSTATE_INTERNED_IMMORTAL    2
325
/* Value of the state.ascii flag of op.  No type check is performed.
   op is parenthesized inside the cast so that expression arguments are
   cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)(op))->state.ascii)
327
/* PyUnicode_WCHAR_KIND: the string contains only wstr characters.  This
   is only possible when the string was created with a legacy API and
   PyUnicode_Ready() has not been called yet. */
#define PyUnicode_WCHAR_KIND    0

/* Values returned by the PyUnicode_KIND() macro: */
#define PyUnicode_1BYTE_KIND    1
#define PyUnicode_2BYTE_KIND    2
#define PyUnicode_4BYTE_KIND    3
338
339
/* Number of bytes the string uses to represent a single character; this
   can be 1, 2 or 4 and is computed as 1 << (kind - 1).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op)    \
    (1 << (PyUnicode_KIND(op) - 1))
346
/* Typed views of the canonical representation for direct character
   access: the PyUnicode_DATA() pointer cast to Py_UCS1 *, Py_UCS2 * or
   Py_UCS4 *.  No checks are performed; use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() beforehand to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op)    ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op)    ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op)    ((Py_UCS4*)PyUnicode_DATA(op))
355
/* Value of the state.compact flag: non-zero if the string is compact,
   0 if not.  No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op)                \
    (((PyASCIIObject*)(op))->state.compact)
360
/* Return the state.kind field of op, one of the PyUnicode_*_KIND values
   defined above.  Asserts that op is a unicode object and that it is
   ready. */
#define PyUnicode_KIND(op)                      \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
366
/* Return a void pointer to the raw unicode buffer.

   _PyUnicode_COMPACT_DATA: the address just past the object header,
   using the PyASCIIObject size when state.ascii is set and the
   PyCompactUnicodeObject size otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* _PyUnicode_NONCOMPACT_DATA: the data.any pointer, asserted to be
   non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* PyUnicode_DATA: selects one of the two accessors above according to
   PyUnicode_IS_COMPACT(), after asserting that op is a unicode object. */
#define PyUnicode_DATA(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
381
/* Convert a character index into a size in bytes: the result is
   index * char_size, where char_size is 2 ** (kind - 1).  The
   multiplication is carried out as a left shift by (kind - 1).

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index)    ((index) << ((kind) - 1))
387
388/* In the access macros below, "kind" may be evaluated more than once.
389 All other macro parameters are evaluated exactly once, so it is safe
390 to put side effects into them (such as increasing the index). */
391
/* Write into the canonical representation.  This macro does not do any
   sanity checks and is intended for usage in loops; the caller should
   cache the kind and data pointers obtained from other macro calls.

   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.  The value
   is cast to the element type selected by kind; any kind other than the
   1-byte and 2-byte kinds is asserted to be PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
    } while (0)
414
/* Read a code point from the string's canonical representation and return
   it as Py_UCS4.  data is indexed as Py_UCS1, Py_UCS2 or Py_UCS4
   according to kind.  No checks or ready calls are performed. */
#define PyUnicode_READ(kind, data, index)               \
    ((Py_UCS4)                                          \
     ((kind) == PyUnicode_1BYTE_KIND ?                  \
      ((const Py_UCS1 *)(data))[(index)] :              \
      ((kind) == PyUnicode_2BYTE_KIND ?                 \
       ((const Py_UCS2 *)(data))[(index)] :             \
       ((const Py_UCS4 *)(data))[(index)])))
426
/* Read the code point at index from a unicode object.

   PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR; for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.  Asserts that the argument
   is a unicode object and that it is ready. */
#define PyUnicode_READ_CHAR(unicode, index)                             \
    (assert(PyUnicode_Check(unicode)),                                  \
     assert(PyUnicode_IS_READY(unicode)),                               \
     (Py_UCS4)                                                          \
     (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ?               \
      ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] :         \
      (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ?              \
       ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] :        \
       ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)])))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200442
/* Return the length field of the unicode string.  The caller has to make
   sure that the string has its canonical representation set before
   calling this macro; call PyUnicode_(FAST_)Ready to ensure that.  Both
   conditions (unicode object, ready) are asserted. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
450
451
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   Reads the state.ready flag; no type check is performed.  op is
   parenthesized inside the cast so that expression arguments are cast as
   a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
456
/* Make sure op has its canonical representation.

   PyUnicode_READY() does less work than PyUnicode_Ready() in the best
   case: when the ready flag is already set it evaluates to 0 without a
   call.  If the canonical representation is not yet set, it calls
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                     \
    (assert(PyUnicode_Check(op)),               \
     (PyUnicode_IS_READY(op) ?                  \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200465
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   The result is 0x7f when state.ascii is set; for 1-byte strings 0x7f
   when the data pointer equals the utf8 pointer and 0xff otherwise;
   0xffff for 2-byte strings; and 0x10ffff for everything else.  Asserts
   that op is ready. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                    \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f:                            \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU)                                               \
       ) :                                                              \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) : (0x10ffffU)                                         \
       ))))
479
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000480#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000481
482/* --- Constants ---------------------------------------------------------- */
483
/* Replacement character used during decoding when the errors argument is
   set to "replace".  Note: U+FFFD is the official REPLACEMENT CHARACTER
   in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER    ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000490
491/* === Public API ========================================================= */
492
493/* --- Plain Py_UNICODE --------------------------------------------------- */
494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200495/* With PEP 393, this is the recommended way to allocate a new unicode object.
496 This function will allocate the object and its buffer in a single memory
497 block. Objects created using this function are not resizable. */
498#ifndef Py_LIMITED_API
499PyAPI_FUNC(PyObject*) PyUnicode_New(
500 Py_ssize_t size, /* Number of code points in the new string */
501 Py_UCS4 maxchar /* maximum code point value in the string */
502 );
503#endif
504
Victor Stinnerd8f65102011-09-29 19:43:17 +0200505/* Initializes the canonical string representation from a the deprecated
506 wstr/Py_UNICODE representation. This function is used to convert Unicode
507 objects which were created using the old API to the new flexible format
508 introduced with PEP 393.
509
510 Don't call this function directly, use the public PyUnicode_READY() macro
511 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200512#ifndef Py_LIMITED_API
513PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200514 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200515 );
516#endif
517
Victor Stinner034f6cf2011-09-30 02:26:44 +0200518/* Get a copy of a Unicode string. */
519PyAPI_FUNC(PyObject*) PyUnicode_Copy(
520 PyObject *unicode
521 );
522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200523/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200524 character conversion when necessary and falls back to memcpy if possible.
525
Victor Stinnera0702ab2011-09-29 14:14:38 +0200526 Fail if to is too small (smaller than how_many or smaller than
527 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
528 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200529
530 Return the number of written character, or return -1 and raise an exception
531 on error.
532
533 Pseudo-code:
534
535 how_many = min(how_many, len(from) - from_start)
536 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
537 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200538
539 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200540 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200542PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 PyObject *to,
544 Py_ssize_t to_start,
545 PyObject *from,
546 Py_ssize_t from_start,
547 Py_ssize_t how_many
548 );
549#endif
550
Guido van Rossumd8225182000-03-10 22:33:05 +0000551/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000552 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000553
554 u may be NULL which causes the contents to be undefined. It is the
555 user's responsibility to fill in the needed data afterwards. Note
556 that modifying the Unicode object contents after construction is
557 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000558
559 The buffer is copied into the new object. */
560
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000561#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000562PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000563 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000564 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000565 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000566#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000567
Georg Brandl952867a2010-06-27 10:17:12 +0000568/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000570 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000571 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000572 );
573
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000574/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200575 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000576PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000577 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000578 );
579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200580#ifndef Py_LIMITED_API
581PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
582 int kind,
583 const void *buffer,
584 Py_ssize_t size);
585#endif
586
587PyAPI_FUNC(PyObject*) PyUnicode_Substring(
588 PyObject *str,
589 Py_ssize_t start,
590 Py_ssize_t end);
591
592/* Copy the string into a UCS4 buffer including the null character is copy_null
593 is set. Return NULL and raise an exception on error. Raise a ValueError if
594 the buffer is smaller than the string. Return buffer on success.
595
596 buflen is the length of the buffer in (Py_UCS4) characters. */
597PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
598 PyObject *unicode,
599 Py_UCS4* buffer,
600 Py_ssize_t buflen,
601 int copy_null);
602
603/* Copy the string into a UCS4 buffer. A new buffer is allocated using
604 * PyMem_Malloc; if this fails, NULL is returned with a memory error
605 exception set. */
606PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
607
Guido van Rossumd8225182000-03-10 22:33:05 +0000608/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609 Py_UNICODE buffer.
610 If the wchar_t/Py_UNICODE representation is not yet available, this
611 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000612
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000613#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000614PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000616 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000617#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200619/* Return a read-only pointer to the Unicode object's internal
620 Py_UNICODE buffer and save the length at size.
621 If the wchar_t/Py_UNICODE representation is not yet available, this
622 function will calculate it. */
623
624#ifndef Py_LIMITED_API
625PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
626 PyObject *unicode, /* Unicode object */
627 Py_ssize_t *size /* location where to save the length */
628 );
629#endif
630
Guido van Rossumd8225182000-03-10 22:33:05 +0000631/* Get the length of the Unicode object. */
632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
634 PyObject *unicode
635);
636
Victor Stinner157f83f2011-09-28 21:41:31 +0200637/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638 string representation. */
639
Martin v. Löwis18e16552006-02-15 17:27:45 +0000640PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000641 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000642 );
643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200644/* Read a character from the string. */
645
646PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
647 PyObject *unicode,
648 Py_ssize_t index
649 );
650
651/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200652 PyUnicode_New, must not be shared, and must not have been hashed yet.
653
654 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655
656PyAPI_FUNC(int) PyUnicode_WriteChar(
657 PyObject *unicode,
658 Py_ssize_t index,
659 Py_UCS4 character
660 );
661
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000662#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000663/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000664PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000665#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000666
Guido van Rossum52c23592000-04-10 13:41:41 +0000667/* Resize an already allocated Unicode object to the new size length.
668
669 *unicode is modified to point to the new (resized) object and 0
670 returned on success.
671
672 This API may only be called by the function which also called the
673 Unicode constructor. The refcount on the object must be 1. Otherwise,
674 an error is returned.
675
676 Error handling is implemented as follows: an exception is set, -1
677 is returned and *unicode left untouched.
678
679*/
680
Mark Hammond91a681d2002-08-12 07:21:58 +0000681PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 PyObject **unicode, /* Pointer to the Unicode object */
683 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000684 );
685
Guido van Rossumd8225182000-03-10 22:33:05 +0000686/* Coerce obj to a Unicode object and return a reference with
687 *incremented* refcount.
688
689 Coercion is done in the following way:
690
Georg Brandl952867a2010-06-27 10:17:12 +0000691 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000692 under the assumptions that they contain data using the UTF-8
693 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000694
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000695 2. All other objects (including Unicode objects) raise an
696 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000697
698 The API returns NULL in case of an error. The caller is responsible
699 for decref'ing the returned objects.
700
701*/
702
Mark Hammond91a681d2002-08-12 07:21:58 +0000703PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000705 const char *encoding, /* encoding */
706 const char *errors /* error handling */
707 );
708
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000709/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000710 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000712 Unicode objects are passed back as-is (subclasses are converted to
713 true Unicode objects), all other objects are delegated to
714 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000715 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000716
717 The API returns NULL in case of an error. The caller is responsible
718 for decref'ing the returned objects.
719
720*/
721
Mark Hammond91a681d2002-08-12 07:21:58 +0000722PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000723 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000724 );
725
Victor Stinner1205f272010-09-11 00:54:47 +0000726PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
727 const char *format, /* ASCII-encoded string */
728 va_list vargs
729 );
730PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
731 const char *format, /* ASCII-encoded string */
732 ...
733 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000735#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000736/* Format the object based on the format_spec, as defined in PEP 3101
737 (Advanced String Formatting). */
738PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200739 PyObject *format_spec,
740 Py_ssize_t start,
741 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000742#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000743
Walter Dörwald16807132007-05-25 13:52:07 +0000744PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
745PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000746PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
747 const char *u /* UTF-8 encoded string */
748 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000749#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000750PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000751#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000752
753/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200754#define PyUnicode_CHECK_INTERNED(op) \
755 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000756
Guido van Rossumd8225182000-03-10 22:33:05 +0000757/* --- wchar_t support for platforms which support it --------------------- */
758
759#ifdef HAVE_WCHAR_H
760
Georg Brandl952867a2010-06-27 10:17:12 +0000761/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000762 size.
763
764 The buffer is copied into the new object. */
765
Mark Hammond91a681d2002-08-12 07:21:58 +0000766PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000767 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000768 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000769 );
770
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000771/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000772 most size wchar_t characters are copied.
773
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000774 Note that the resulting wchar_t string may or may not be
775 0-terminated. It is the responsibility of the caller to make sure
776 that the wchar_t string is 0-terminated in case this is required by
777 the application.
778
779 Returns the number of wchar_t characters copied (excluding a
780 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000781 error. */
782
Martin v. Löwis18e16552006-02-15 17:27:45 +0000783PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000784 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000785 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000786 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000787 );
788
Victor Stinner137c34c2010-09-29 10:25:54 +0000789/* Convert the Unicode object to a wide character string. The output string
790 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200791 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000792
793 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
794 on success. On error, returns NULL, *size is undefined and raises a
795 MemoryError. */
796
797PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000798 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000799 Py_ssize_t *size /* number of characters of the result */
800 );
801
Victor Stinner9f789e72011-10-01 03:57:28 +0200802#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200804#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805
Guido van Rossumd8225182000-03-10 22:33:05 +0000806#endif
807
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000808/* --- Unicode ordinals --------------------------------------------------- */
809
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810/* Create a Unicode Object from the given Unicode code point ordinal.
811
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000812 The ordinal must be in range(0x10000) on narrow Python builds
813 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
814 raised in case it is not.
815
816*/
817
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000818PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000819
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000820/* --- Free-list management ----------------------------------------------- */
821
822/* Clear the free list used by the Unicode implementation.
823
824 This can be used to release memory used for objects on the free
825 list back to the Python memory allocator.
826
827*/
828
829PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
830
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000832
833 Many of these APIs take two arguments encoding and errors. These
834 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000835 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000836
Georg Brandl952867a2010-06-27 10:17:12 +0000837 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000838
839 Error handling is set by errors which may also be set to NULL
840 meaning to use the default handling defined for the codec. Default
841 error handling for all builtin codecs is "strict" (ValueErrors are
842 raised).
843
844 The codecs all use a similar interface. Only deviations from the
845 generic ones are documented.
846
847*/
848
Fred Drakecb093fe2000-05-09 19:51:53 +0000849/* --- Manage the default encoding ---------------------------------------- */
850
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000851/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000852 Unicode object unicode and the size of the encoded representation
853 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000854
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000855 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000856
Victor Stinner157f83f2011-09-28 21:41:31 +0200857 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 and subsequent calls will return the same string. The memory is released
859 when the unicodeobject is deallocated.
860
861 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
862 support the previous internal function with the same behaviour.
863
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000864 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000865 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000866
867 *** If you need to access the Unicode object as UTF-8 bytes string,
868 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000869*/
870
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000871#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000873 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000874 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000876#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000877
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000878/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000879 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
882 in the unicodeobject.
883
884 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
885 support the previous internal function with the same behaviour.
886
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000887 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000888 extracted from the returned data.
889
890 *** This API is for interpreter INTERNAL USE ONLY and will likely
891 *** be removed or changed in the future.
892
893 *** If you need to access the Unicode object as UTF-8 bytes string,
894 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000895
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000896*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000897
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000898#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
900#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000901#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000902
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000903/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000904
Mark Hammond91a681d2002-08-12 07:21:58 +0000905PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000906
Guido van Rossumd8225182000-03-10 22:33:05 +0000907/* --- Generic Codecs ----------------------------------------------------- */
908
909/* Create a Unicode object by decoding the encoded string s of the
910 given size. */
911
Mark Hammond91a681d2002-08-12 07:21:58 +0000912PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000913 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000914 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000915 const char *encoding, /* encoding */
916 const char *errors /* error handling */
917 );
918
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000919/* Decode a Unicode object unicode and return the result as Python
920 object. */
921
922PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 PyObject *unicode, /* Unicode object */
924 const char *encoding, /* encoding */
925 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000926 );
927
928/* Decode a Unicode object unicode and return the result as Unicode
929 object. */
930
931PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000932 PyObject *unicode, /* Unicode object */
933 const char *encoding, /* encoding */
934 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000935 );
936
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000937/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000938 Python string object. */
939
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000940#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000941PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000942 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000944 const char *encoding, /* encoding */
945 const char *errors /* error handling */
946 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000947#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000948
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000949/* Encodes a Unicode object and returns the result as Python
950 object. */
951
952PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000953 PyObject *unicode, /* Unicode object */
954 const char *encoding, /* encoding */
955 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000956 );
957
Guido van Rossumd8225182000-03-10 22:33:05 +0000958/* Encodes a Unicode object and returns the result as Python string
959 object. */
960
Mark Hammond91a681d2002-08-12 07:21:58 +0000961PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000962 PyObject *unicode, /* Unicode object */
963 const char *encoding, /* encoding */
964 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000965 );
966
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000967/* Encodes a Unicode object and returns the result as Unicode
968 object. */
969
970PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000971 PyObject *unicode, /* Unicode object */
972 const char *encoding, /* encoding */
973 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000974 );
975
976/* Build an encoding map. */
977
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000978PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
979 PyObject* string /* 256 character map */
980 );
981
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000982/* --- UTF-7 Codecs ------------------------------------------------------- */
983
Mark Hammond91a681d2002-08-12 07:21:58 +0000984PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000985 const char *string, /* UTF-7 encoded string */
986 Py_ssize_t length, /* size of string */
987 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 );
989
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000990PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991 const char *string, /* UTF-7 encoded string */
992 Py_ssize_t length, /* size of string */
993 const char *errors, /* error handling */
994 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000995 );
996
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000997#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000998PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000999 const Py_UNICODE *data, /* Unicode char buffer */
1000 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1001 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1002 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1003 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001004 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001005#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006
Guido van Rossumd8225182000-03-10 22:33:05 +00001007/* --- UTF-8 Codecs ------------------------------------------------------- */
1008
Mark Hammond91a681d2002-08-12 07:21:58 +00001009PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001010 const char *string, /* UTF-8 encoded string */
1011 Py_ssize_t length, /* size of string */
1012 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001013 );
1014
Walter Dörwald69652032004-09-07 20:24:22 +00001015PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 const char *string, /* UTF-8 encoded string */
1017 Py_ssize_t length, /* size of string */
1018 const char *errors, /* error handling */
1019 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001020 );
1021
Mark Hammond91a681d2002-08-12 07:21:58 +00001022PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 );
1025
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001026#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1028 PyObject *unicode,
1029 const char *errors);
1030
Mark Hammond91a681d2002-08-12 07:21:58 +00001031PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001032 const Py_UNICODE *data, /* Unicode char buffer */
1033 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1034 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001035 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001036#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001037
Walter Dörwald41980ca2007-08-16 21:55:45 +00001038/* --- UTF-32 Codecs ------------------------------------------------------ */
1039
1040/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1041 the corresponding Unicode object.
1042
1043 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001044 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001045
1046 If byteorder is non-NULL, the decoder starts decoding using the
1047 given byte order:
1048
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001049 *byteorder == -1: little endian
1050 *byteorder == 0: native order
1051 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001052
1053 In native mode, the first four bytes of the stream are checked for a
1054 BOM mark. If found, the BOM mark is analysed, the byte order
1055 adjusted and the BOM skipped. In the other modes, no BOM mark
1056 interpretation is done. After completion, *byteorder is set to the
1057 current byte order at the end of input data.
1058
1059 If byteorder is NULL, the codec starts in native order mode.
1060
1061*/
1062
1063PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001064 const char *string, /* UTF-32 encoded string */
1065 Py_ssize_t length, /* size of string */
1066 const char *errors, /* error handling */
1067 int *byteorder /* pointer to byteorder to use
1068 0=native;-1=LE,1=BE; updated on
1069 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001070 );
1071
1072PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001073 const char *string, /* UTF-32 encoded string */
1074 Py_ssize_t length, /* size of string */
1075 const char *errors, /* error handling */
1076 int *byteorder, /* pointer to byteorder to use
1077 0=native;-1=LE,1=BE; updated on
1078 exit */
1079 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001080 );
1081
1082/* Returns a Python string using the UTF-32 encoding in native byte
1083 order. The string always starts with a BOM mark. */
1084
1085PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001086 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001087 );
1088
1089/* Returns a Python string object holding the UTF-32 encoded value of
1090 the Unicode data.
1091
1092 If byteorder is not 0, output is written according to the following
1093 byte order:
1094
1095 byteorder == -1: little endian
1096 byteorder == 0: native byte order (writes a BOM mark)
1097 byteorder == 1: big endian
1098
1099 If byteorder is 0, the output string will always start with the
1100 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1101 prepended.
1102
1103*/
1104
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001105#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001106PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 const Py_UNICODE *data, /* Unicode char buffer */
1108 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1109 const char *errors, /* error handling */
1110 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001111 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001112#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001113
Guido van Rossumd8225182000-03-10 22:33:05 +00001114/* --- UTF-16 Codecs ------------------------------------------------------ */
1115
Guido van Rossum9e896b32000-04-05 20:11:21 +00001116/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001117 the corresponding Unicode object.
1118
1119 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001121
1122 If byteorder is non-NULL, the decoder starts decoding using the
1123 given byte order:
1124
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001125 *byteorder == -1: little endian
1126 *byteorder == 0: native order
1127 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001128
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001129 In native mode, the first two bytes of the stream are checked for a
1130 BOM mark. If found, the BOM mark is analysed, the byte order
1131 adjusted and the BOM skipped. In the other modes, no BOM mark
1132 interpretation is done. After completion, *byteorder is set to the
1133 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001134
1135 If byteorder is NULL, the codec starts in native order mode.
1136
1137*/
1138
Mark Hammond91a681d2002-08-12 07:21:58 +00001139PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001140 const char *string, /* UTF-16 encoded string */
1141 Py_ssize_t length, /* size of string */
1142 const char *errors, /* error handling */
1143 int *byteorder /* pointer to byteorder to use
1144 0=native;-1=LE,1=BE; updated on
1145 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001146 );
1147
Walter Dörwald69652032004-09-07 20:24:22 +00001148PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 const char *string, /* UTF-16 encoded string */
1150 Py_ssize_t length, /* size of string */
1151 const char *errors, /* error handling */
1152 int *byteorder, /* pointer to byteorder to use
1153 0=native;-1=LE,1=BE; updated on
1154 exit */
1155 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001156 );
1157
Guido van Rossumd8225182000-03-10 22:33:05 +00001158/* Returns a Python string using the UTF-16 encoding in native byte
1159 order. The string always starts with a BOM mark. */
1160
Mark Hammond91a681d2002-08-12 07:21:58 +00001161PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001163 );
1164
1165/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001166 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001167
1168 If byteorder is not 0, output is written according to the following
1169 byte order:
1170
1171 byteorder == -1: little endian
1172 byteorder == 0: native byte order (writes a BOM mark)
1173 byteorder == 1: big endian
1174
1175 If byteorder is 0, the output string will always start with the
1176 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1177 prepended.
1178
1179 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1180 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001181 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001182
1183*/
1184
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001185#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001186PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001187 const Py_UNICODE *data, /* Unicode char buffer */
1188 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1189 const char *errors, /* error handling */
1190 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001191 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001192#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001193
1194/* --- Unicode-Escape Codecs ---------------------------------------------- */
1195
Mark Hammond91a681d2002-08-12 07:21:58 +00001196PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 const char *string, /* Unicode-Escape encoded string */
1198 Py_ssize_t length, /* size of string */
1199 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001200 );
1201
Mark Hammond91a681d2002-08-12 07:21:58 +00001202PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001203 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001204 );
1205
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001206#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001207PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 const Py_UNICODE *data, /* Unicode char buffer */
1209 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001210 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001211#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001212
1213/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1214
Mark Hammond91a681d2002-08-12 07:21:58 +00001215PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001216 const char *string, /* Raw-Unicode-Escape encoded string */
1217 Py_ssize_t length, /* size of string */
1218 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001219 );
1220
Mark Hammond91a681d2002-08-12 07:21:58 +00001221PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001223 );
1224
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001225#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001226PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 const Py_UNICODE *data, /* Unicode char buffer */
1228 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001229 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001230#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001231
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001232/* --- Unicode Internal Codec ---------------------------------------------
1233
1234 Only for internal use in _codecsmodule.c */
1235
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001236#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001237PyObject *_PyUnicode_DecodeUnicodeInternal(
1238 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001239 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001240 const char *errors
1241 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001242#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001243
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001245
1246 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1247
1248*/
1249
Mark Hammond91a681d2002-08-12 07:21:58 +00001250PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 const char *string, /* Latin-1 encoded string */
1252 Py_ssize_t length, /* size of string */
1253 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001254 );
1255
Mark Hammond91a681d2002-08-12 07:21:58 +00001256PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001257 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001258 );
1259
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001260#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001261PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1262 PyObject* unicode,
1263 const char* errors);
1264
Mark Hammond91a681d2002-08-12 07:21:58 +00001265PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 const Py_UNICODE *data, /* Unicode char buffer */
1267 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1268 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001269 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001270#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001271
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001273
1274 Only 7-bit ASCII data is accepted. All other codes generate errors.
1275
1276*/
1277
Mark Hammond91a681d2002-08-12 07:21:58 +00001278PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 const char *string, /* ASCII encoded string */
1280 Py_ssize_t length, /* size of string */
1281 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001282 );
1283
Mark Hammond91a681d2002-08-12 07:21:58 +00001284PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001286 );
1287
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001288#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1290 PyObject* unicode,
1291 const char* errors);
1292
Mark Hammond91a681d2002-08-12 07:21:58 +00001293PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 const Py_UNICODE *data, /* Unicode char buffer */
1295 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1296 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001297 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001298#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001299
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001301
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001303
1304 Decoding mappings must map single string characters to single
1305 Unicode characters, integers (which are then interpreted as Unicode
1306 ordinals) or None (meaning "undefined mapping" and causing an
1307 error).
1308
1309 Encoding mappings must map single Unicode characters to single
1310 string characters, integers (which are then interpreted as Latin-1
1311 ordinals) or None (meaning "undefined mapping" and causing an
1312 error).
1313
1314 If a character lookup fails with a LookupError, the character is
1315 copied as-is meaning that its ordinal value will be interpreted as
1316 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1317 to contain those mappings which map characters to different code
1318 points.
1319
1320*/
1321
Mark Hammond91a681d2002-08-12 07:21:58 +00001322PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 const char *string, /* Encoded string */
1324 Py_ssize_t length, /* size of string */
1325 PyObject *mapping, /* character mapping
1326 (char ordinal -> unicode ordinal) */
1327 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001328 );
1329
Mark Hammond91a681d2002-08-12 07:21:58 +00001330PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001331 PyObject *unicode, /* Unicode object */
1332 PyObject *mapping /* character mapping
1333 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001334 );
1335
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001336#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001337PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 const Py_UNICODE *data, /* Unicode char buffer */
1339 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1340 PyObject *mapping, /* character mapping
1341 (unicode ordinal -> char ordinal) */
1342 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001343 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001344#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001345
1346/* Translate a Py_UNICODE buffer of the given length by applying a
1347 character mapping table to it and return the resulting Unicode
1348 object.
1349
1350 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001352
1353 Mapping tables may be dictionaries or sequences. Unmapped character
1354 ordinals (ones which cause a LookupError) are left untouched and
1355 are copied as-is.
1356
1357*/
1358
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001359#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001360PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001361 const Py_UNICODE *data, /* Unicode char buffer */
1362 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1363 PyObject *table, /* Translate table */
1364 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001365 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001366#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001367
Victor Stinner99b95382011-07-04 14:23:54 +02001368#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001369
Guido van Rossumefec1152000-03-28 02:01:15 +00001370/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001371
Mark Hammond91a681d2002-08-12 07:21:58 +00001372PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001373 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001374 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001375 const char *errors /* error handling */
1376 );
1377
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001378PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1379 const char *string, /* MBCS encoded string */
1380 Py_ssize_t length, /* size of string */
1381 const char *errors, /* error handling */
1382 Py_ssize_t *consumed /* bytes consumed */
1383 );
1384
Mark Hammond91a681d2002-08-12 07:21:58 +00001385PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001386 PyObject *unicode /* Unicode object */
1387 );
1388
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001389#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001390PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001391 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001392 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001393 const char *errors /* error handling */
1394 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001395#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001396
Victor Stinner99b95382011-07-04 14:23:54 +02001397#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001398
Guido van Rossum9e896b32000-04-05 20:11:21 +00001399/* --- Decimal Encoder ---------------------------------------------------- */
1400
1401/* Takes a Unicode string holding a decimal value and writes it into
1402 an output buffer using standard ASCII digit codes.
1403
1404 The output buffer has to provide at least length+1 bytes of storage
1405 area. The output string is 0-terminated.
1406
1407 The encoder converts whitespace to ' ', decimal characters to their
1408 corresponding ASCII digit and all other Latin-1 characters except
1409 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1410 are treated as errors. This includes embedded NUL characters.
1411
1412 Error handling is defined by the errors argument:
1413
1414 NULL or "strict": raise a ValueError
1415 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001417 "replace": replaces illegal characters with '?'
1418
1419 Returns 0 on success, -1 on failure.
1420
1421*/
1422
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001423#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001424PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 Py_UNICODE *s, /* Unicode buffer */
1426 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1427 char *output, /* Output buffer; must have size >= length+1 (0-terminated) */
1428 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001429 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001430#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001431
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001432/* Transforms code points that have decimal digit property to the
1433 corresponding ASCII digit code points.
1434
1435 Returns a new Unicode string on success, NULL on failure.
1436*/
1437
Georg Brandlb5503082010-12-05 11:40:48 +00001438#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001439PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1440 Py_UNICODE *s, /* Unicode buffer */
1441 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1442 );
Georg Brandlb5503082010-12-05 11:40:48 +00001443#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1446 as argument instead of a raw buffer and length. This function additionally
1447 transforms spaces to ASCII because this is what the callers in longobject,
1448 floatobject, and complexobject did anyways. */
1449
1450#ifndef Py_LIMITED_API
1451PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1452 PyObject *unicode /* Unicode object */
1453 );
1454#endif
1455
Martin v. Löwis011e8422009-05-05 04:43:17 +00001456/* --- File system encoding ---------------------------------------------- */
1457
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001458/* ParseTuple converter: encode str objects to bytes using
1459 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001460
1461PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1462
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001463/* ParseTuple converter: decode bytes objects to unicode using
1464 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1465
1466PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1467
Victor Stinner77c38622010-05-14 15:58:55 +00001468/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1469 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001470
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001471 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1472 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001473
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001474 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001475*/
1476
1477PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1478 const char *s /* encoded string */
1479 );
1480
Victor Stinner77c38622010-05-14 15:58:55 +00001481/* Decode a string using Py_FileSystemDefaultEncoding
1482 and the "surrogateescape" error handler.
1483
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001484 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1485 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001486*/
1487
Martin v. Löwis011e8422009-05-05 04:43:17 +00001488PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1489 const char *s, /* encoded string */
1490 Py_ssize_t size /* size */
1491 );
1492
Victor Stinnerae6265f2010-05-15 16:27:27 +00001493/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001494 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001495
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001496 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1497 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001498*/
1499
1500PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1501 PyObject *unicode
1502 );
1503
Guido van Rossumd8225182000-03-10 22:33:05 +00001504/* --- Methods & Slots ----------------------------------------------------
1505
1506 These are capable of handling Unicode objects and strings on input
1507 (we refer to them as strings in the descriptions) and return
1508 Unicode objects or integers as appropriate. */
1509
1510/* Concat two strings giving a new Unicode string. */
1511
Mark Hammond91a681d2002-08-12 07:21:58 +00001512PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 PyObject *left, /* Left string */
1514 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001515 );
1516
Walter Dörwald1ab83302007-05-18 17:15:44 +00001517/* Concat two strings and put the result in *pleft
1518 (sets *pleft to NULL on error) */
1519
1520PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001521 PyObject **pleft, /* Pointer to left string */
1522 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001523 );
1524
1525/* Concat two strings, put the result in *pleft and drop the right object
1526 (sets *pleft to NULL on error) */
1527
1528PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001529 PyObject **pleft, /* Pointer to left string */
1530 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001531 );
1532
Guido van Rossumd8225182000-03-10 22:33:05 +00001533/* Split a string giving a list of Unicode strings.
1534
1535 If sep is NULL, splitting will be done at all whitespace
1536 substrings. Otherwise, splits occur at the given separator.
1537
1538 At most maxsplit splits will be done. If negative, no limit is set.
1539
1540 Separators are not included in the resulting list.
1541
1542*/
1543
Mark Hammond91a681d2002-08-12 07:21:58 +00001544PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545 PyObject *s, /* String to split */
1546 PyObject *sep, /* String separator */
1547 Py_ssize_t maxsplit /* Maxsplit count */
1548 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001549
1550/* Ditto, but split at line breaks.
1551
1552 CRLF is considered to be one line break. Line breaks are not
1553 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001554
Mark Hammond91a681d2002-08-12 07:21:58 +00001555PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001556 PyObject *s, /* String to split */
1557 int keepends /* If true, line end markers are included */
1558 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001559
Thomas Wouters477c8d52006-05-27 19:21:47 +00001560/* Partition a string using a given separator. */
1561
1562PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 PyObject *s, /* String to partition */
1564 PyObject *sep /* String separator */
1565 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001566
1567/* Partition a string using a given separator, searching from the end of the
1568 string. */
1569
1570PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001571 PyObject *s, /* String to partition */
1572 PyObject *sep /* String separator */
1573 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001574
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001575/* Split a string giving a list of Unicode strings.
1576
1577 If sep is NULL, splitting will be done at all whitespace
1578 substrings. Otherwise, splits occur at the given separator.
1579
1580 At most maxsplit splits will be done; if maxsplit is negative, no
1581 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1582 the end of the string.
1583
1584 Separators are not included in the resulting list.
1585
1586*/
1587
1588PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001589 PyObject *s, /* String to split */
1590 PyObject *sep, /* String separator */
1591 Py_ssize_t maxsplit /* Maxsplit count */
1592 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001593
Guido van Rossumd8225182000-03-10 22:33:05 +00001594/* Translate a string by applying a character mapping table to it and
1595 return the resulting Unicode object.
1596
1597 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001598 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001599
1600 Mapping tables may be dictionaries or sequences. Unmapped character
1601 ordinals (ones which cause a LookupError) are left untouched and
1602 are copied as-is.
1603
1604*/
1605
Mark Hammond91a681d2002-08-12 07:21:58 +00001606PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 PyObject *str, /* String */
1608 PyObject *table, /* Translate table */
1609 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001610 );
1611
1612/* Join a sequence of strings using the given separator and return
1613 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001614
Mark Hammond91a681d2002-08-12 07:21:58 +00001615PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001616 PyObject *separator, /* Separator string */
1617 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001618 );
1619
1620/* Return 1 if substr matches str[start:end] at the given tail end, 0
1621 otherwise. */
1622
Martin v. Löwis18e16552006-02-15 17:27:45 +00001623PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001624 PyObject *str, /* String */
1625 PyObject *substr, /* Prefix or Suffix string */
1626 Py_ssize_t start, /* Start index */
1627 Py_ssize_t end, /* Stop index */
1628 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001629 );
1630
1631/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001632 given search direction or -1 if not found. -2 is returned in case
1633 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001634
Martin v. Löwis18e16552006-02-15 17:27:45 +00001635PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 PyObject *str, /* String */
1637 PyObject *substr, /* Substring to find */
1638 Py_ssize_t start, /* Start index */
1639 Py_ssize_t end, /* Stop index */
1640 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001641 );
1642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643/* Like PyUnicode_Find, but search for single character only. */
1644PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1645 PyObject *str,
1646 Py_UCS4 ch,
1647 Py_ssize_t start,
1648 Py_ssize_t end,
1649 int direction
1650 );
1651
Barry Warsaw51ac5802000-03-20 16:36:48 +00001652/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001653
Martin v. Löwis18e16552006-02-15 17:27:45 +00001654PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 PyObject *str, /* String */
1656 PyObject *substr, /* Substring to count */
1657 Py_ssize_t start, /* Start index */
1658 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001659 );
1660
Barry Warsaw51ac5802000-03-20 16:36:48 +00001661/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001662 and return the resulting Unicode object. */
1663
Mark Hammond91a681d2002-08-12 07:21:58 +00001664PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 PyObject *str, /* String */
1666 PyObject *substr, /* Substring to find */
1667 PyObject *replstr, /* Substring to replace */
1668 Py_ssize_t maxcount /* Max. number of replacements to apply;
1669 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001670 );
1671
1672/* Compare two strings and return -1, 0, 1 for less than, equal,
1673 greater than resp. */
1674
Mark Hammond91a681d2002-08-12 07:21:58 +00001675PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001676 PyObject *left, /* Left string */
1677 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001678 );
1679
Martin v. Löwis5b222132007-06-10 09:51:05 +00001680PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1681 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001682 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001683 );
1684
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001685/* Rich compare two strings and return one of the following:
1686
1687 - NULL in case an exception was raised
1688 - Py_True or Py_False for successful comparisons
1689 - Py_NotImplemented in case the type combination is unknown
1690
1691 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1692 case the conversion of the arguments to Unicode fails with a
1693 UnicodeDecodeError.
1694
1695 Possible values for op:
1696
1697 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1698
1699*/
1700
1701PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 PyObject *left, /* Left string */
1703 PyObject *right, /* Right string */
1704 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001705 );
1706
Thomas Wouters7e474022000-07-16 12:04:32 +00001707/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001708 the resulting Unicode string. */
1709
Mark Hammond91a681d2002-08-12 07:21:58 +00001710PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001711 PyObject *format, /* Format string */
1712 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001713 );
1714
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001715/* Checks whether element is contained in container and return 1/0
1716 accordingly.
1717
1718 element has to coerce to a one-element Unicode string. -1 is
1719 returned in case of an error. */
1720
Mark Hammond91a681d2002-08-12 07:21:58 +00001721PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 PyObject *container, /* Container string */
1723 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001724 );
1725
Martin v. Löwis47383402007-08-15 07:32:56 +00001726/* Checks whether argument is a valid identifier. */
1727
1728PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1729
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001730#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001731/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001732PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001733 PyUnicodeObject *self,
1734 int striptype,
1735 PyObject *sepobj
1736 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001737#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001738
Eric Smith5807c412008-05-11 21:00:57 +00001739/* Using the current locale, insert the thousands grouping
1740 into the string pointed to by buffer. For the argument descriptions,
1741 see Objects/stringlib/localeutil.h */
1742
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001743#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001744PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1745 Py_ssize_t n_buffer,
1746 Py_UNICODE *digits,
1747 Py_ssize_t n_digits,
1748 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001749#endif
Eric Smith5807c412008-05-11 21:00:57 +00001750
Eric Smitha3b1ac82009-04-03 14:45:06 +00001751/* Using explicit passed-in values, insert the thousands grouping
1752 into the string pointed to by buffer. For the argument descriptions,
1753 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001754#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1756 int kind,
1757 void *buffer,
1758 Py_ssize_t n_buffer,
1759 void *digits,
1760 Py_ssize_t n_digits,
1761 Py_ssize_t min_width,
1762 const char *grouping,
1763 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001764#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001765/* === Characters Type APIs =============================================== */
1766
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001767/* Helper array used by Py_UNICODE_ISSPACE(). */
1768
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001769#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001770PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1771
Guido van Rossumd8225182000-03-10 22:33:05 +00001772/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001773 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001774
1775 These APIs are implemented in Objects/unicodectype.c.
1776
1777*/
1778
Mark Hammond91a681d2002-08-12 07:21:58 +00001779PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001780 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001781 );
1782
Mark Hammond91a681d2002-08-12 07:21:58 +00001783PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001784 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001785 );
1786
Mark Hammond91a681d2002-08-12 07:21:58 +00001787PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001788 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001789 );
1790
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001791PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001792 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001793 );
1794
1795PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001796 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001797 );
1798
Mark Hammond91a681d2002-08-12 07:21:58 +00001799PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001800 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001801 );
1802
Mark Hammond91a681d2002-08-12 07:21:58 +00001803PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001804 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001805 );
1806
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001807PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1808 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001809 );
1810
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001811PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1812 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001813 );
1814
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001815PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1816 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001817 );
1818
Mark Hammond91a681d2002-08-12 07:21:58 +00001819PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001820 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001821 );
1822
Mark Hammond91a681d2002-08-12 07:21:58 +00001823PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001824 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001825 );
1826
Mark Hammond91a681d2002-08-12 07:21:58 +00001827PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001828 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001829 );
1830
Mark Hammond91a681d2002-08-12 07:21:58 +00001831PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001832 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001833 );
1834
Mark Hammond91a681d2002-08-12 07:21:58 +00001835PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001836 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001837 );
1838
Mark Hammond91a681d2002-08-12 07:21:58 +00001839PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001840 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001841 );
1842
/* Prototype only.  Accepts one Py_UCS4 code point and returns an int.
   Presumably a predicate that is nonzero when ch is printable; the exact
   definition of printable is not visible here.  TODO confirm. */
Georg Brandl559e5d72008-06-11 18:37:52 +00001843PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001844 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001845 );
1846
/* Prototype only.  Accepts one Py_UCS4 code point and returns an int.
   Presumably a predicate that is nonzero when ch is alphabetic;
   TODO confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001847PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001848 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001849 );
1850
/* Prototype only.  Takes a read-only (const) Py_UNICODE pointer and returns
   a size_t.  Presumably the Py_UNICODE counterpart of C strlen(), counting
   elements of u up to a terminator; TODO confirm. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001851PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1852 const Py_UNICODE *u
1853 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001854
/* Prototype only.  s1 is writable, s2 is read-only (const); returns a
   Py_UNICODE pointer.  The signature carries no size argument, so the
   caller presumably has to guarantee that s1 is large enough.
   Presumably the Py_UNICODE counterpart of C strcpy(); TODO confirm. */
1855PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001856 Py_UNICODE *s1,
1857 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001858
/* Prototype only.  s1 is writable, s2 is read-only (const); returns a
   Py_UNICODE pointer.  The signature carries no size argument, so the
   caller presumably has to guarantee that s1 has room for the result.
   Presumably the Py_UNICODE counterpart of C strcat(); TODO confirm. */
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001859PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1860 Py_UNICODE *s1, const Py_UNICODE *s2);
1861
/* Prototype only.  s1 is writable, s2 is read-only (const), n is a size_t;
   returns a Py_UNICODE pointer.  Presumably the Py_UNICODE counterpart of
   C strncpy() with n as an element count.  Whether the result is always
   terminated is not visible here; TODO confirm before relying on it. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001862PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001863 Py_UNICODE *s1,
1864 const Py_UNICODE *s2,
1865 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001866
/* Prototype only.  Takes two read-only (const) Py_UNICODE pointers and
   returns an int.  Presumably the Py_UNICODE counterpart of C strcmp()
   (negative, zero or positive ordering result); TODO confirm. */
1867PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001868 const Py_UNICODE *s1,
1869 const Py_UNICODE *s2
1870 );
1871
/* Prototype only.  Takes two read-only (const) Py_UNICODE pointers plus a
   size_t n and returns an int.  Presumably the Py_UNICODE counterpart of
   C strncmp(), comparing at most n elements; TODO confirm. */
1872PyAPI_FUNC(int) Py_UNICODE_strncmp(
1873 const Py_UNICODE *s1,
1874 const Py_UNICODE *s2,
1875 size_t n
1876 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001877
/* Prototype only.  Takes a read-only (const) Py_UNICODE pointer s and a
   Py_UNICODE value c.  Note that the return type is a non-const
   Py_UNICODE pointer, so the const qualifier of s is not carried over.
   Presumably the Py_UNICODE counterpart of C strchr() (first match of c
   in s, NULL when absent); TODO confirm. */
1878PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001879 const Py_UNICODE *s,
1880 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001881 );
1882
/* Prototype only.  Takes a read-only (const) Py_UNICODE pointer s and a
   Py_UNICODE value c.  Note that the return type is a non-const
   Py_UNICODE pointer, so the const qualifier of s is not carried over.
   Presumably the Py_UNICODE counterpart of C strrchr() (last match of c
   in s, NULL when absent); TODO confirm. */
Victor Stinner331ea922010-08-10 16:37:20 +00001883PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001884 const Py_UNICODE *s,
1885 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001886 );
1887
/* Prototype only.  Takes a read-only (const) Py_UCS4 pointer and returns a
   size_t.  Presumably the Py_UCS4 counterpart of C strlen(), counting
   elements of u up to a terminator; TODO confirm. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888PyAPI_FUNC(size_t) Py_UCS4_strlen(
1889 const Py_UCS4 *u
1890 );
1891
/* Prototype only.  s1 is writable, s2 is read-only (const); returns a
   Py_UCS4 pointer.  The signature carries no size argument, so the caller
   presumably has to guarantee that s1 is large enough.
   Presumably the Py_UCS4 counterpart of C strcpy(); TODO confirm. */
1892PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1893 Py_UCS4 *s1,
1894 const Py_UCS4 *s2);
1895
/* Prototype only.  s1 is writable, s2 is read-only (const); returns a
   Py_UCS4 pointer.  The signature carries no size argument, so the caller
   presumably has to guarantee that s1 has room for the result.
   Presumably the Py_UCS4 counterpart of C strcat(); TODO confirm. */
1896PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1897 Py_UCS4 *s1, const Py_UCS4 *s2);
1898
/* Prototype only.  s1 is writable, s2 is read-only (const), n is a size_t;
   returns a Py_UCS4 pointer.  Presumably the Py_UCS4 counterpart of
   C strncpy() with n as an element count.  Whether the result is always
   terminated is not visible here; TODO confirm before relying on it. */
1899PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1900 Py_UCS4 *s1,
1901 const Py_UCS4 *s2,
1902 size_t n);
1903
/* Prototype only.  Takes two read-only (const) Py_UCS4 pointers and returns
   an int.  Presumably the Py_UCS4 counterpart of C strcmp() (negative,
   zero or positive ordering result); TODO confirm. */
1904PyAPI_FUNC(int) Py_UCS4_strcmp(
1905 const Py_UCS4 *s1,
1906 const Py_UCS4 *s2
1907 );
1908
/* Prototype only.  Takes two read-only (const) Py_UCS4 pointers plus a
   size_t n and returns an int.  Presumably the Py_UCS4 counterpart of
   C strncmp(), comparing at most n elements; TODO confirm. */
1909PyAPI_FUNC(int) Py_UCS4_strncmp(
1910 const Py_UCS4 *s1,
1911 const Py_UCS4 *s2,
1912 size_t n
1913 );
1914
/* Prototype only.  Takes a read-only (const) Py_UCS4 pointer s and a
   Py_UCS4 value c.  Note that the return type is a non-const Py_UCS4
   pointer, so the const qualifier of s is not carried over.
   Presumably the Py_UCS4 counterpart of C strchr() (first match of c in
   s, NULL when absent); TODO confirm. */
1915PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1916 const Py_UCS4 *s,
1917 Py_UCS4 c
1918 );
1919
/* Prototype only.  Takes a read-only (const) Py_UCS4 pointer s and a
   Py_UCS4 value c.  Note that the return type is a non-const Py_UCS4
   pointer, so the const qualifier of s is not carried over.
   Presumably the Py_UCS4 counterpart of C strrchr() (last match of c in
   s, NULL when absent); TODO confirm. */
1920PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1921 const Py_UCS4 *s,
1922 Py_UCS4 c
1923 );
1924
Victor Stinner71133ff2010-09-01 23:43:53 +00001925/* Create a copy of a unicode string ending with a nul character. Return NULL
1926 and raise a MemoryError exception on memory allocation failure, otherwise
1927 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1928
Victor Stinner46408602010-09-03 16:18:00 +00001929PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001930 PyObject *unicode
1931 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001932#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001933
Guido van Rossumd8225182000-03-10 22:33:05 +00001934#ifdef __cplusplus
1935}
1936#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001937#endif /* !Py_UNICODEOBJECT_H */