blob: 314dee455b44b786454927a9b3b57bc5864a1b58 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` elements of the Py_UNICODE
   buffer `target`.

   Every argument is evaluated exactly once, in the order target, value,
   length: the bound is captured in n_ before the loop starts, so passing
   an expression with side effects (or an expensive call) as `length` is
   safe.  A `length` <= 0 writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t i_, n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.

   The IS_* macros evaluate `ch` twice, so avoid arguments with side
   effects.  The argument is fully parenthesized in the expansion, so
   compound expressions such as `c & 0xFFFF` or `flag ? a : b` are
   classified correctly. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; the caller is
   responsible for checking that `high` and `low` really are a
   high/low surrogate pair. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   `string` and `substring` are pointers to unicode objects; they are
   cast to PyASCIIObject* to reach the wstr pointer, and the length of
   the substring is taken with PyUnicode_WSTR_LENGTH().  (The wstr and
   wstr_length fields live in different nested structs, so plain
   `(obj)->wstr` / `(obj)->wstr_length` member access cannot compile for
   a single pointer type.)

   The wstr buffers are dereferenced directly and are NOT computed on
   demand: both objects must already have a non-NULL wstr
   representation.  Arguments are evaluated several times. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
209 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200210 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000211 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212 struct {
213 /*
214 SSTATE_NOT_INTERNED (0)
215 SSTATE_INTERNED_MORTAL (1)
216 SSTATE_INTERNED_IMMORTAL (2)
217
218 If interned != SSTATE_NOT_INTERNED, the two references from the
219 dictionary to this object are *not* counted in ob_refcnt.
220 */
221 unsigned int interned:2;
222 /* Character size:
223
224 PyUnicode_WCHAR_KIND (0): wchar_t*
225 PyUnicode_1BYTE_KIND (1): Py_UCS1*
226 PyUnicode_2BYTE_KIND (2): Py_UCS2*
227 PyUnicode_4BYTE_KIND (3): Py_UCS4*
228 */
229 unsigned int kind:2;
230 /* Compact is with respect to the allocation scheme. Compact unicode
231 objects only require one memory block while non-compact objects use
232 one block for the PyUnicodeObject struct and another for its data
233 buffer. */
234 unsigned int compact:1;
235 /* Compact objects which are ASCII-only also have the state.compact
236 flag set, and use the PyASCIIObject struct. */
237 unsigned int ascii:1;
238 /* The ready flag indicates whether the object layout is initialized
239 completely. This means that this is either a compact object, or
240 the data pointer is filled out. The bit is redundant, and helps
241 to minimize the test in PyUnicode_IS_READY(). */
242 unsigned int ready:1;
243 } state;
244 wchar_t *wstr; /* wchar_t representation (null-terminated) */
245} PyASCIIObject;
246
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure. */
250typedef struct {
251 PyASCIIObject _base;
252 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
253 * terminating \0. */
254 char *utf8; /* UTF-8 representation (null-terminated) */
255 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
256 * surrogates count as two code points. */
257} PyCompactUnicodeObject;
258
259/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
260 PyUnicodeObject structure. The actual string data is initially in the wstr
261 block, and copied into the data block using PyUnicode_Ready. */
262typedef struct {
263 PyCompactUnicodeObject _base;
264 union {
265 void *any;
266 Py_UCS1 *latin1;
267 Py_UCS2 *ucs2;
268 Py_UCS4 *ucs4;
269 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000270} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000271#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000272
Mark Hammond91a681d2002-08-12 07:21:58 +0000273PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000274PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000275
Thomas Wouters27d517b2007-02-25 20:39:11 +0000276#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000277 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
278#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000279
280/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000281#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200282
/* Length of the wstr (wchar_t) representation of `op`.

   For objects with state.ascii set the value is read from the `length`
   field of PyASCIIObject; otherwise it is read from the `wstr_length`
   field of PyCompactUnicodeObject.  No type check is performed, and the
   macro does not verify that wstr has actually been filled in.
   `op` is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (((PyASCIIObject*)(op))->state.ascii ?          \
     ((PyASCIIObject*)(op))->length :               \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
287
288/* Returns the deprecated Py_UNICODE representation's size in code units
289 (this includes surrogate pairs as 2 units).
290 If the Py_UNICODE representation is not available, it will be computed
291 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
292
Guido van Rossumd8225182000-03-10 22:33:05 +0000293#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 (assert(PyUnicode_Check(op)), \
295 (((PyASCIIObject *)(op))->wstr) ? \
296 PyUnicode_WSTR_LENGTH(op) : \
297 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
298 PyUnicode_WSTR_LENGTH(op)))
299
Guido van Rossumd8225182000-03-10 22:33:05 +0000300#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200301 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
302
303/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
304 representation on demand. Using this macro is very inefficient now,
305 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
306 use PyUnicode_WRITE() and PyUnicode_READ(). */
307
Guido van Rossumd8225182000-03-10 22:33:05 +0000308#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200309 (assert(PyUnicode_Check(op)), \
310 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
311 PyUnicode_AsUnicode((PyObject *)(op)))
312
Guido van Rossumd8225182000-03-10 22:33:05 +0000313#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 ((const char *)(PyUnicode_AS_UNICODE(op)))
315
316
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
318
319/* Values for PyUnicodeObject.state: */
320
321/* Interning state. */
322#define SSTATE_NOT_INTERNED 0
323#define SSTATE_INTERNED_MORTAL 1
324#define SSTATE_INTERNED_IMMORTAL 2
325
/* Return the state.ascii flag of `op` (non-zero for strings that use the
   PyASCIIObject layout).  No type check is performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)(op))->state.ascii)
327
328/* String contains only wstr byte characters. This is only possible
329 when the string was created with a legacy API and PyUnicode_Ready()
330 has not been called yet. */
331#define PyUnicode_WCHAR_KIND 0
332
333/* Return values of the PyUnicode_KIND() macro: */
334
335#define PyUnicode_1BYTE_KIND 1
336#define PyUnicode_2BYTE_KIND 2
337#define PyUnicode_4BYTE_KIND 3
338
339
340/* Return the number of bytes the string uses to represent single characters,
Victor Stinner4584a5b2011-10-01 02:39:37 +0200341 this can be 1, 2 or 4.
342
343 See also PyUnicode_KIND_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200344#define PyUnicode_CHARACTER_SIZE(op) \
345 (1 << (PyUnicode_KIND(op) - 1))
346
347/* Return pointers to the canonical representation casted as unsigned char,
348 Py_UCS2, or Py_UCS4 for direct character access.
349 No checks are performed, use PyUnicode_CHARACTER_SIZE or
350 PyUnicode_KIND() before to ensure these will work correctly. */
351
352#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
353#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
354#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
355
356/* Return true if the string is compact or 0 if not.
357 No type checks or Ready calls are performed. */
358#define PyUnicode_IS_COMPACT(op) \
359 (((PyASCIIObject*)(op))->state.compact)
360
Victor Stinner157f83f2011-09-28 21:41:31 +0200361/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362#define PyUnicode_KIND(op) \
363 (assert(PyUnicode_Check(op)), \
364 assert(PyUnicode_IS_READY(op)), \
365 ((PyASCIIObject *)(op))->state.kind)
366
Victor Stinner157f83f2011-09-28 21:41:31 +0200367/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368#define _PyUnicode_COMPACT_DATA(op) \
369 (PyUnicode_IS_COMPACT_ASCII(op) ? \
370 ((void*)((PyASCIIObject*)(op) + 1)) : \
371 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
372
373#define _PyUnicode_NONCOMPACT_DATA(op) \
374 (assert(((PyUnicodeObject*)(op))->data.any), \
375 ((((PyUnicodeObject *)(op))->data.any)))
376
377#define PyUnicode_DATA(op) \
378 (assert(PyUnicode_Check(op)), \
379 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
380 _PyUnicode_NONCOMPACT_DATA(op))
381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
Victor Stinner4584a5b2011-10-01 02:39:37 +0200383 The index is a character index, the result is a size in bytes.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384
Victor Stinner4584a5b2011-10-01 02:39:37 +0200385 See also PyUnicode_CHARACTER_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
387
388/* In the access macros below, "kind" may be evaluated more than once.
389 All other macro parameters are evaluated exactly once, so it is safe
390 to put side effects into them (such as increasing the index). */
391
/* Write into the canonical representation.  This macro does not do any
   sanity checks and is intended for usage in loops.  The caller should
   cache the kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location. */
397#define PyUnicode_WRITE(kind, data, index, value) \
398 do { \
399 switch ((kind)) { \
400 case PyUnicode_1BYTE_KIND: { \
401 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
402 break; \
403 } \
404 case PyUnicode_2BYTE_KIND: { \
405 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
406 break; \
407 } \
408 default: { \
409 assert((kind) == PyUnicode_4BYTE_KIND); \
410 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
411 } \
412 } \
413 } while (0)
414
/* Fetch the code point stored at position `index` of the canonical
   buffer `data`, whose element width is selected by `kind`
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, anything else is read
   as Py_UCS4).  The result is converted to Py_UCS4.

   No type checks or ready calls are performed.  `kind` may be
   evaluated twice; `data` and `index` are evaluated once. */
#define PyUnicode_READ(kind, data, index)                           \
    ((Py_UCS4)                                                      \
     ((kind) == PyUnicode_1BYTE_KIND                                \
          ? ((const Py_UCS1 *)(data))[(index)]                      \
      : (kind) == PyUnicode_2BYTE_KIND                              \
          ? ((const Py_UCS2 *)(data))[(index)]                      \
          : ((const Py_UCS4 *)(data))[(index)]))
426
427/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
428 calls PyUnicode_KIND() and might call it twice. For single reads, use
429 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
430 cache kind and use PyUnicode_READ instead. */
431#define PyUnicode_READ_CHAR(unicode, index) \
432 ((Py_UCS4) \
433 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200434 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
436 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
437 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
438 ) \
439 ))
440
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
444#define PyUnicode_GET_LENGTH(op) \
445 (assert(PyUnicode_Check(op)), \
446 assert(PyUnicode_IS_READY(op)), \
447 ((PyASCIIObject *)(op))->length)
448
449
/* Fast check to determine whether an object is ready.  Reads the
   state.ready bit; equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   No type check is performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
454
455/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
456 case. If the canonical representation is not yet set, it will still call
457 PyUnicode_Ready().
458 Returns 0 on success and -1 on errors. */
459#define PyUnicode_READY(op) \
460 (assert(PyUnicode_Check(op)), \
461 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200462 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   The result is chosen as follows:
     - state.ascii set                              -> 0x7f
     - 1-byte kind, data pointer == utf8 pointer    -> 0x7f
     - 1-byte kind otherwise                        -> 0xff
     - 2-byte kind                                  -> 0xffff
     - anything else                                -> 0x10ffff
   The object must be ready (asserted); `op` is evaluated several times. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                       \
    (assert(PyUnicode_IS_READY(op)),                                       \
     PyUnicode_IS_COMPACT_ASCII(op) ? 0x7fU :                              \
     PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                          \
         (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
              0x7fU : 0xffU) :                                             \
     PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? 0xffffU : 0x10ffffU)
477
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000478#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000479
480/* --- Constants ---------------------------------------------------------- */
481
482/* This Unicode character will be used as replacement character during
483 decoding if the errors argument is set to "replace". Note: the
484 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
485 Unicode 3.0. */
486
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200487#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000488
489/* === Public API ========================================================= */
490
491/* --- Plain Py_UNICODE --------------------------------------------------- */
492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200493/* With PEP 393, this is the recommended way to allocate a new unicode object.
494 This function will allocate the object and its buffer in a single memory
495 block. Objects created using this function are not resizable. */
496#ifndef Py_LIMITED_API
497PyAPI_FUNC(PyObject*) PyUnicode_New(
498 Py_ssize_t size, /* Number of code points in the new string */
499 Py_UCS4 maxchar /* maximum code point value in the string */
500 );
501#endif
502
/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   Don't call this function directly, use the public PyUnicode_READY() macro
   instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200510#ifndef Py_LIMITED_API
511PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200512 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200513 );
514#endif
515
Victor Stinner034f6cf2011-09-30 02:26:44 +0200516/* Get a copy of a Unicode string. */
517PyAPI_FUNC(PyObject*) PyUnicode_Copy(
518 PyObject *unicode
519 );
520
/* Copy characters from one unicode object into another.  This function
   performs character conversion when necessary and falls back to memcpy
   if possible.

   Fail if to is too small (smaller than how_many or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if to has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200540PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541 PyObject *to,
542 Py_ssize_t to_start,
543 PyObject *from,
544 Py_ssize_t from_start,
545 Py_ssize_t how_many
546 );
547#endif
548
Guido van Rossumd8225182000-03-10 22:33:05 +0000549/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000550 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000551
552 u may be NULL which causes the contents to be undefined. It is the
553 user's responsibility to fill in the needed data afterwards. Note
554 that modifying the Unicode object contents after construction is
555 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000556
557 The buffer is copied into the new object. */
558
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000559#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000560PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000561 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000562 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000563 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000564#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000565
Georg Brandl952867a2010-06-27 10:17:12 +0000566/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000567PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000568 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000569 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000570 );
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200573 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000574PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000575 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000576 );
577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200578#ifndef Py_LIMITED_API
579PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
580 int kind,
581 const void *buffer,
582 Py_ssize_t size);
583#endif
584
585PyAPI_FUNC(PyObject*) PyUnicode_Substring(
586 PyObject *str,
587 Py_ssize_t start,
588 Py_ssize_t end);
589
/* Copy the string into a UCS4 buffer, including the null character if
   copy_null is set. Return NULL and raise an exception on error. Raise a
   ValueError if the buffer is smaller than the string. Return buffer on
   success.

   buflen is the length of the buffer in (Py_UCS4) characters. */
595PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
596 PyObject *unicode,
597 Py_UCS4* buffer,
598 Py_ssize_t buflen,
599 int copy_null);
600
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
   PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
604PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
605
Guido van Rossumd8225182000-03-10 22:33:05 +0000606/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607 Py_UNICODE buffer.
608 If the wchar_t/Py_UNICODE representation is not yet available, this
609 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000610
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000611#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000612PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000613 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000614 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000615#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617/* Return a read-only pointer to the Unicode object's internal
618 Py_UNICODE buffer and save the length at size.
619 If the wchar_t/Py_UNICODE representation is not yet available, this
620 function will calculate it. */
621
622#ifndef Py_LIMITED_API
623PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
624 PyObject *unicode, /* Unicode object */
625 Py_ssize_t *size /* location where to save the length */
626 );
627#endif
628
Guido van Rossumd8225182000-03-10 22:33:05 +0000629/* Get the length of the Unicode object. */
630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200631PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
632 PyObject *unicode
633);
634
Victor Stinner157f83f2011-09-28 21:41:31 +0200635/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200636 string representation. */
637
Martin v. Löwis18e16552006-02-15 17:27:45 +0000638PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000640 );
641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642/* Read a character from the string. */
643
644PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
645 PyObject *unicode,
646 Py_ssize_t index
647 );
648
649/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200650 PyUnicode_New, must not be shared, and must not have been hashed yet.
651
652 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200653
654PyAPI_FUNC(int) PyUnicode_WriteChar(
655 PyObject *unicode,
656 Py_ssize_t index,
657 Py_UCS4 character
658 );
659
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000660#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000661/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000662PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000663#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000664
Guido van Rossum52c23592000-04-10 13:41:41 +0000665/* Resize an already allocated Unicode object to the new size length.
666
667 *unicode is modified to point to the new (resized) object and 0
668 returned on success.
669
670 This API may only be called by the function which also called the
671 Unicode constructor. The refcount on the object must be 1. Otherwise,
672 an error is returned.
673
674 Error handling is implemented as follows: an exception is set, -1
675 is returned and *unicode left untouched.
676
677*/
678
Mark Hammond91a681d2002-08-12 07:21:58 +0000679PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000680 PyObject **unicode, /* Pointer to the Unicode object */
681 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000682 );
683
Guido van Rossumd8225182000-03-10 22:33:05 +0000684/* Coerce obj to a Unicode object and return a reference with
685 *incremented* refcount.
686
687 Coercion is done in the following way:
688
Georg Brandl952867a2010-06-27 10:17:12 +0000689 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000690 under the assumptions that they contain data using the UTF-8
691 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000692
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000693 2. All other objects (including Unicode objects) raise an
694 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000695
696 The API returns NULL in case of an error. The caller is responsible
697 for decref'ing the returned objects.
698
699*/
700
Mark Hammond91a681d2002-08-12 07:21:58 +0000701PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000702 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000703 const char *encoding, /* encoding */
704 const char *errors /* error handling */
705 );
706
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000707/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000708 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000710 Unicode objects are passed back as-is (subclasses are converted to
711 true Unicode objects), all other objects are delegated to
712 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000713 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000714
715 The API returns NULL in case of an error. The caller is responsible
716 for decref'ing the returned objects.
717
718*/
719
Mark Hammond91a681d2002-08-12 07:21:58 +0000720PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000721 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000722 );
723
Victor Stinner1205f272010-09-11 00:54:47 +0000724PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
725 const char *format, /* ASCII-encoded string */
726 va_list vargs
727 );
728PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
729 const char *format, /* ASCII-encoded string */
730 ...
731 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000733#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000734/* Format the object based on the format_spec, as defined in PEP 3101
735 (Advanced String Formatting). */
736PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200737 PyObject *format_spec,
738 Py_ssize_t start,
739 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000740#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000741
Walter Dörwald16807132007-05-25 13:52:07 +0000742PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
743PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000744PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
745 const char *u /* UTF-8 encoded string */
746 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000747#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000748PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000749#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000750
751/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200752#define PyUnicode_CHECK_INTERNED(op) \
753 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000754
Guido van Rossumd8225182000-03-10 22:33:05 +0000755/* --- wchar_t support for platforms which support it --------------------- */
756
757#ifdef HAVE_WCHAR_H
758
Georg Brandl952867a2010-06-27 10:17:12 +0000759/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000760 size.
761
762 The buffer is copied into the new object. */
763
Mark Hammond91a681d2002-08-12 07:21:58 +0000764PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000765 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000766 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000767 );
768
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000769/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000770 most size wchar_t characters are copied.
771
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000772 Note that the resulting wchar_t string may or may not be
773 0-terminated. It is the responsibility of the caller to make sure
774 that the wchar_t string is 0-terminated in case this is required by
775 the application.
776
777 Returns the number of wchar_t characters copied (excluding a
778 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000779 error. */
780
Martin v. Löwis18e16552006-02-15 17:27:45 +0000781PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000782 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000783 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000784 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000785 );
786
Victor Stinner137c34c2010-09-29 10:25:54 +0000787/* Convert the Unicode object to a wide character string. The output string
788 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200789 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000790
791 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
792 on success. On error, returns NULL, *size is undefined and raises a
793 MemoryError. */
794
795PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000796 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000797 Py_ssize_t *size /* number of characters of the result */
798 );
799
Victor Stinner9f789e72011-10-01 03:57:28 +0200800#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200802#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803
Guido van Rossumd8225182000-03-10 22:33:05 +0000804#endif
805
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000806/* --- Unicode ordinals --------------------------------------------------- */
807
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808/* Create a Unicode Object from the given Unicode code point ordinal.
809
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000810 The ordinal must be in range(0x10000) on narrow Python builds
811 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
812 raised in case it is not.
813
814*/
815
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000816PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000817
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000818/* --- Free-list management ----------------------------------------------- */
819
820/* Clear the free list used by the Unicode implementation.
821
822 This can be used to release memory used for objects on the free
823 list back to the Python memory allocator.
824
825*/
826
827PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
828
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000830
831 Many of these APIs take two arguments encoding and errors. These
832 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000833 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000834
Georg Brandl952867a2010-06-27 10:17:12 +0000835 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000836
837 Error handling is set by errors which may also be set to NULL
838 meaning to use the default handling defined for the codec. Default
839 error handling for all builtin codecs is "strict" (ValueErrors are
840 raised).
841
842 The codecs all use a similar interface. Only deviations from the
843 generic ones are documented.
844
845*/
846
Fred Drakecb093fe2000-05-09 19:51:53 +0000847/* --- Manage the default encoding ---------------------------------------- */
848
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000849/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000850 Unicode object unicode and the size of the encoded representation
851 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000852
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000853 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000854
Victor Stinner157f83f2011-09-28 21:41:31 +0200855 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856 and subsequent calls will return the same string. The memory is released
857 when the unicodeobject is deallocated.
858
859 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
860 support the previous internal function with the same behaviour.
861
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000862 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000863 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000864
865 *** If you need to access the Unicode object as UTF-8 bytes string,
866 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000867*/
868
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000869#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000871 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000872 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000874#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000875
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000876/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000877 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
880 in the unicodeobject.
881
882 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
883 support the previous internal function with the same behaviour.
884
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000885 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000886 extracted from the returned data.
887
888 *** This API is for interpreter INTERNAL USE ONLY and will likely
889 *** be removed or changed in the future.
890
891 *** If you need to access the Unicode object as UTF-8 bytes string,
892 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000893
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000894*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000895
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000896#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
898#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000899#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000900
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000901/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000902
Mark Hammond91a681d2002-08-12 07:21:58 +0000903PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000904
Guido van Rossumd8225182000-03-10 22:33:05 +0000905/* --- Generic Codecs ----------------------------------------------------- */
906
907/* Create a Unicode object by decoding the encoded string s of the
908 given size. */
909
Mark Hammond91a681d2002-08-12 07:21:58 +0000910PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000911 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000912 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000913 const char *encoding, /* encoding */
914 const char *errors /* error handling */
915 );
916
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000917/* Decode a Unicode object unicode and return the result as Python
918 object. */
919
920PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921 PyObject *unicode, /* Unicode object */
922 const char *encoding, /* encoding */
923 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000924 );
925
926/* Decode a Unicode object unicode and return the result as Unicode
927 object. */
928
929PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000930 PyObject *unicode, /* Unicode object */
931 const char *encoding, /* encoding */
932 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000933 );
934
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000936 Python string object. */
937
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000938#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000939PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000940 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000941 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000942 const char *encoding, /* encoding */
943 const char *errors /* error handling */
944 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000945#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000946
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000947/* Encodes a Unicode object and returns the result as Python
948 object. */
949
950PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000951 PyObject *unicode, /* Unicode object */
952 const char *encoding, /* encoding */
953 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000954 );
955
Guido van Rossumd8225182000-03-10 22:33:05 +0000956/* Encodes a Unicode object and returns the result as Python string
957 object. */
958
Mark Hammond91a681d2002-08-12 07:21:58 +0000959PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000960 PyObject *unicode, /* Unicode object */
961 const char *encoding, /* encoding */
962 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000963 );
964
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000965/* Encodes a Unicode object and returns the result as Unicode
966 object. */
967
968PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000969 PyObject *unicode, /* Unicode object */
970 const char *encoding, /* encoding */
971 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000972 );
973
974/* Build an encoding map. */
975
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000976PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
977 PyObject* string /* 256 character map */
978 );
979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980/* --- UTF-7 Codecs ------------------------------------------------------- */
981
Mark Hammond91a681d2002-08-12 07:21:58 +0000982PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000983 const char *string, /* UTF-7 encoded string */
984 Py_ssize_t length, /* size of string */
985 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000986 );
987
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000988PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000989 const char *string, /* UTF-7 encoded string */
990 Py_ssize_t length, /* size of string */
991 const char *errors, /* error handling */
992 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000993 );
994
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000995#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000996PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000997 const Py_UNICODE *data, /* Unicode char buffer */
998 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
999 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1000 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1001 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001003#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001004
Guido van Rossumd8225182000-03-10 22:33:05 +00001005/* --- UTF-8 Codecs ------------------------------------------------------- */
1006
Mark Hammond91a681d2002-08-12 07:21:58 +00001007PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001008 const char *string, /* UTF-8 encoded string */
1009 Py_ssize_t length, /* size of string */
1010 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001011 );
1012
Walter Dörwald69652032004-09-07 20:24:22 +00001013PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 const char *string, /* UTF-8 encoded string */
1015 Py_ssize_t length, /* size of string */
1016 const char *errors, /* error handling */
1017 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001018 );
1019
Mark Hammond91a681d2002-08-12 07:21:58 +00001020PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001022 );
1023
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001024#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001025PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1026 PyObject *unicode,
1027 const char *errors);
1028
Mark Hammond91a681d2002-08-12 07:21:58 +00001029PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 const Py_UNICODE *data, /* Unicode char buffer */
1031 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1032 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001033 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001034#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001035
Walter Dörwald41980ca2007-08-16 21:55:45 +00001036/* --- UTF-32 Codecs ------------------------------------------------------ */
1037
1038/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1039 the corresponding Unicode object.
1040
1041 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001043
1044 If byteorder is non-NULL, the decoder starts decoding using the
1045 given byte order:
1046
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001047 *byteorder == -1: little endian
1048 *byteorder == 0: native order
1049 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001050
1051 In native mode, the first four bytes of the stream are checked for a
1052 BOM mark. If found, the BOM mark is analysed, the byte order
1053 adjusted and the BOM skipped. In the other modes, no BOM mark
1054 interpretation is done. After completion, *byteorder is set to the
1055 current byte order at the end of input data.
1056
1057 If byteorder is NULL, the codec starts in native order mode.
1058
1059*/
1060
1061PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 const char *string, /* UTF-32 encoded string */
1063 Py_ssize_t length, /* size of string */
1064 const char *errors, /* error handling */
1065 int *byteorder /* pointer to byteorder to use
1066 0=native;-1=LE,1=BE; updated on
1067 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001068 );
1069
1070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 const char *string, /* UTF-32 encoded string */
1072 Py_ssize_t length, /* size of string */
1073 const char *errors, /* error handling */
1074 int *byteorder, /* pointer to byteorder to use
1075 0=native;-1=LE,1=BE; updated on
1076 exit */
1077 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001078 );
1079
1080/* Returns a Python string using the UTF-32 encoding in native byte
1081 order. The string always starts with a BOM mark. */
1082
1083PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001084 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001085 );
1086
1087/* Returns a Python string object holding the UTF-32 encoded value of
1088 the Unicode data.
1089
1090 If byteorder is not 0, output is written according to the following
1091 byte order:
1092
1093 byteorder == -1: little endian
1094 byteorder == 0: native byte order (writes a BOM mark)
1095 byteorder == 1: big endian
1096
1097 If byteorder is 0, the output string will always start with the
1098 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1099 prepended.
1100
1101*/
1102
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001103#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001104PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 const Py_UNICODE *data, /* Unicode char buffer */
1106 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1107 const char *errors, /* error handling */
1108 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001109 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001110#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001111
Guido van Rossumd8225182000-03-10 22:33:05 +00001112/* --- UTF-16 Codecs ------------------------------------------------------ */
1113
Guido van Rossum9e896b32000-04-05 20:11:21 +00001114/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001115 the corresponding Unicode object.
1116
1117 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001119
1120 If byteorder is non-NULL, the decoder starts decoding using the
1121 given byte order:
1122
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 *byteorder == -1: little endian
1124 *byteorder == 0: native order
1125 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001126
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001127 In native mode, the first two bytes of the stream are checked for a
1128 BOM mark. If found, the BOM mark is analysed, the byte order
1129 adjusted and the BOM skipped. In the other modes, no BOM mark
1130 interpretation is done. After completion, *byteorder is set to the
1131 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001132
1133 If byteorder is NULL, the codec starts in native order mode.
1134
1135*/
1136
Mark Hammond91a681d2002-08-12 07:21:58 +00001137PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001138 const char *string, /* UTF-16 encoded string */
1139 Py_ssize_t length, /* size of string */
1140 const char *errors, /* error handling */
1141 int *byteorder /* pointer to byteorder to use
1142 0=native;-1=LE,1=BE; updated on
1143 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001144 );
1145
Walter Dörwald69652032004-09-07 20:24:22 +00001146PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 const char *string, /* UTF-16 encoded string */
1148 Py_ssize_t length, /* size of string */
1149 const char *errors, /* error handling */
1150 int *byteorder, /* pointer to byteorder to use
1151 0=native;-1=LE,1=BE; updated on
1152 exit */
1153 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001154 );
1155
Guido van Rossumd8225182000-03-10 22:33:05 +00001156/* Returns a Python string using the UTF-16 encoding in native byte
1157 order. The string always starts with a BOM mark. */
1158
Mark Hammond91a681d2002-08-12 07:21:58 +00001159PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001161 );
1162
1163/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001164 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001165
1166 If byteorder is not 0, output is written according to the following
1167 byte order:
1168
1169 byteorder == -1: little endian
1170 byteorder == 0: native byte order (writes a BOM mark)
1171 byteorder == 1: big endian
1172
1173 If byteorder is 0, the output string will always start with the
1174 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1175 prepended.
1176
1177 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1178 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001179 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001180
1181*/
1182
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001183#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001184PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 const Py_UNICODE *data, /* Unicode char buffer */
1186 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1187 const char *errors, /* error handling */
1188 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001189 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001190#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001191
1192/* --- Unicode-Escape Codecs ---------------------------------------------- */
1193
Mark Hammond91a681d2002-08-12 07:21:58 +00001194PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195 const char *string, /* Unicode-Escape encoded string */
1196 Py_ssize_t length, /* size of string */
1197 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001198 );
1199
Mark Hammond91a681d2002-08-12 07:21:58 +00001200PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001201 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001202 );
1203
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001204#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001205PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 const Py_UNICODE *data, /* Unicode char buffer */
1207 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001208 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001209#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001210
1211/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1212
Mark Hammond91a681d2002-08-12 07:21:58 +00001213PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 const char *string, /* Raw-Unicode-Escape encoded string */
1215 Py_ssize_t length, /* size of string */
1216 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001217 );
1218
Mark Hammond91a681d2002-08-12 07:21:58 +00001219PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001221 );
1222
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001223#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001224PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225 const Py_UNICODE *data, /* Unicode char buffer */
1226 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001227 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001228#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001229
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001230/* --- Unicode Internal Codec ---------------------------------------------
1231
1232 Only for internal use in _codecsmodule.c */
1233
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001234#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001235PyObject *_PyUnicode_DecodeUnicodeInternal(
1236 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001237 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001238 const char *errors
1239 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001240#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001241
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001243
1244 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1245
1246*/
1247
Mark Hammond91a681d2002-08-12 07:21:58 +00001248PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 const char *string, /* Latin-1 encoded string */
1250 Py_ssize_t length, /* size of string */
1251 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001252 );
1253
Mark Hammond91a681d2002-08-12 07:21:58 +00001254PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001255 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001256 );
1257
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001258#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1260 PyObject* unicode,
1261 const char* errors);
1262
Mark Hammond91a681d2002-08-12 07:21:58 +00001263PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 const Py_UNICODE *data, /* Unicode char buffer */
1265 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1266 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001267 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001268#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001269
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001271
1272 Only 7-bit ASCII data is accepted. All other codes generate errors.
1273
1274*/
1275
Mark Hammond91a681d2002-08-12 07:21:58 +00001276PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 const char *string, /* ASCII encoded string */
1278 Py_ssize_t length, /* size of string */
1279 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001280 );
1281
Mark Hammond91a681d2002-08-12 07:21:58 +00001282PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001284 );
1285
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001286#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1288 PyObject* unicode,
1289 const char* errors);
1290
Mark Hammond91a681d2002-08-12 07:21:58 +00001291PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 const Py_UNICODE *data, /* Unicode char buffer */
1293 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1294 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001295 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001296#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001297
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001299
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001301
1302 Decoding mappings must map single string characters to single
1303 Unicode characters, integers (which are then interpreted as Unicode
1304 ordinals) or None (meaning "undefined mapping" and causing an
1305 error).
1306
1307 Encoding mappings must map single Unicode characters to single
1308 string characters, integers (which are then interpreted as Latin-1
1309 ordinals) or None (meaning "undefined mapping" and causing an
1310 error).
1311
1312 If a character lookup fails with a LookupError, the character is
1313 copied as-is meaning that its ordinal value will be interpreted as
1314 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1315 to contain those mappings which map characters to different code
1316 points.
1317
1318*/
1319
Mark Hammond91a681d2002-08-12 07:21:58 +00001320PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 const char *string, /* Encoded string */
1322 Py_ssize_t length, /* size of string */
1323 PyObject *mapping, /* character mapping
1324 (char ordinal -> unicode ordinal) */
1325 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001326 );
1327
Mark Hammond91a681d2002-08-12 07:21:58 +00001328PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 PyObject *unicode, /* Unicode object */
1330 PyObject *mapping /* character mapping
1331 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001332 );
1333
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001334#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001335PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 const Py_UNICODE *data, /* Unicode char buffer */
1337 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1338 PyObject *mapping, /* character mapping
1339 (unicode ordinal -> char ordinal) */
1340 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001341 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001342#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001343
1344/* Translate a Py_UNICODE buffer of the given length by applying a
1345 character mapping table to it and return the resulting Unicode
1346 object.
1347
1348 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001350
1351 Mapping tables may be dictionaries or sequences. Unmapped character
1352 ordinals (ones which cause a LookupError) are left untouched and
1353 are copied as-is.
1354
1355*/
1356
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001357#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001358PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 const Py_UNICODE *data, /* Unicode char buffer */
1360 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1361 PyObject *table, /* Translate table */
1362 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001363 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001364#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001365
Victor Stinner99b95382011-07-04 14:23:54 +02001366#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001367
Guido van Rossumefec1152000-03-28 02:01:15 +00001368/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001369
Mark Hammond91a681d2002-08-12 07:21:58 +00001370PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001371 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001372 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001373 const char *errors /* error handling */
1374 );
1375
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001376PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1377 const char *string, /* MBCS encoded string */
1378 Py_ssize_t length, /* size of string */
1379 const char *errors, /* error handling */
1380 Py_ssize_t *consumed /* bytes consumed */
1381 );
1382
Mark Hammond91a681d2002-08-12 07:21:58 +00001383PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001384 PyObject *unicode /* Unicode object */
1385 );
1386
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001387#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001388PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001389 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001390 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001391 const char *errors /* error handling */
1392 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001393#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001394
Victor Stinner99b95382011-07-04 14:23:54 +02001395#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001396
Guido van Rossum9e896b32000-04-05 20:11:21 +00001397/* --- Decimal Encoder ---------------------------------------------------- */
1398
1399/* Takes a Unicode string holding a decimal value and writes it into
1400 an output buffer using standard ASCII digit codes.
1401
1402 The output buffer has to provide at least length+1 bytes of storage
1403 area. The output string is 0-terminated.
1404
1405 The encoder converts whitespace to ' ', decimal characters to their
1406 corresponding ASCII digit and all other Latin-1 characters except
1407 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1408 are treated as errors. This includes embedded NULL bytes.
1409
1410 Error handling is defined by the errors argument:
1411
1412 NULL or "strict": raise a ValueError
1413 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001415 "replace": replaces illegal characters with '?'
1416
1417 Returns 0 on success, -1 on failure.
1418
1419*/
1420
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001421#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001422PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 Py_UNICODE *s, /* Unicode buffer */
1424 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1425 char *output, /* Output buffer; must have size >= length+1 */
1426 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001427 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001428#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001429
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001430/* Transforms code points that have decimal digit property to the
1431 corresponding ASCII digit code points.
1432
1433 Returns a new Unicode string on success, NULL on failure.
1434*/
1435
Georg Brandlb5503082010-12-05 11:40:48 +00001436#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001437PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1438 Py_UNICODE *s, /* Unicode buffer */
1439 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1440 );
Georg Brandlb5503082010-12-05 11:40:48 +00001441#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1444 as argument instead of a raw buffer and length. This function additionally
1445 transforms spaces to ASCII because this is what the callers in longobject,
1446 floatobject, and complexobject did anyways. */
1447
1448#ifndef Py_LIMITED_API
1449PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1450 PyObject *unicode /* Unicode object */
1451 );
1452#endif
1453
Martin v. Löwis011e8422009-05-05 04:43:17 +00001454/* --- File system encoding ---------------------------------------------- */
1455
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001456/* ParseTuple converter: encode str objects to bytes using
1457 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001458
1459PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1460
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001461/* ParseTuple converter: decode bytes objects to unicode using
1462 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1463
1464PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1465
Victor Stinner77c38622010-05-14 15:58:55 +00001466/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1467 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001468
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001469 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1470 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001471
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001472 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001473*/
1474
1475PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1476 const char *s /* encoded string */
1477 );
1478
Victor Stinner77c38622010-05-14 15:58:55 +00001479/* Decode a string using Py_FileSystemDefaultEncoding
1480 and the "surrogateescape" error handler.
1481
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001482 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1483 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001484*/
1485
Martin v. Löwis011e8422009-05-05 04:43:17 +00001486PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1487 const char *s, /* encoded string */
1488 Py_ssize_t size /* size */
1489 );
1490
Victor Stinnerae6265f2010-05-15 16:27:27 +00001491/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001492 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001493
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001494 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1495 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001496*/
1497
1498PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1499 PyObject *unicode
1500 );
1501
Guido van Rossumd8225182000-03-10 22:33:05 +00001502/* --- Methods & Slots ----------------------------------------------------
1503
1504 These are capable of handling Unicode objects and strings on input
1505 (we refer to them as strings in the descriptions) and return
1506 Unicode objects or integers as appropriate. */
1507
1508/* Concat two strings giving a new Unicode string. */
1509
Mark Hammond91a681d2002-08-12 07:21:58 +00001510PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001511 PyObject *left, /* Left string */
1512 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001513 );
1514
Walter Dörwald1ab83302007-05-18 17:15:44 +00001515/* Concat two strings and put the result in *pleft
1516 (sets *pleft to NULL on error) */
1517
1518PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001519 PyObject **pleft, /* Pointer to left string */
1520 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001521 );
1522
1523/* Concat two strings, put the result in *pleft and drop the right object
1524 (sets *pleft to NULL on error) */
1525
1526PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 PyObject **pleft, /* Pointer to left string */
1528 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001529 );
1530
Guido van Rossumd8225182000-03-10 22:33:05 +00001531/* Split a string giving a list of Unicode strings.
1532
1533 If sep is NULL, splitting will be done at all whitespace
1534 substrings. Otherwise, splits occur at the given separator.
1535
1536 At most maxsplit splits will be done. If negative, no limit is set.
1537
1538 Separators are not included in the resulting list.
1539
1540*/
1541
Mark Hammond91a681d2002-08-12 07:21:58 +00001542PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 PyObject *s, /* String to split */
1544 PyObject *sep, /* String separator */
1545 Py_ssize_t maxsplit /* Maxsplit count */
1546 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001547
1548/* Ditto, but split at line breaks.
1549
1550 CRLF is considered to be one line break. Line breaks are not
1551 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001552
Mark Hammond91a681d2002-08-12 07:21:58 +00001553PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001554 PyObject *s, /* String to split */
1555 int keepends /* If true, line end markers are included */
1556 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001557
Thomas Wouters477c8d52006-05-27 19:21:47 +00001558/* Partition a string using a given separator. */
1559
1560PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001561 PyObject *s, /* String to partition */
1562 PyObject *sep /* String separator */
1563 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001564
1565/* Partition a string using a given separator, searching from the end of the
1566 string. */
1567
1568PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001569 PyObject *s, /* String to partition */
1570 PyObject *sep /* String separator */
1571 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001572
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001573/* Split a string giving a list of Unicode strings.
1574
1575 If sep is NULL, splitting will be done at all whitespace
1576 substrings. Otherwise, splits occur at the given separator.
1577
1578 At most maxsplit splits will be done; if maxsplit is negative, no
1579 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1580 the end of the string.
1581
1582 Separators are not included in the resulting list.
1583
1584*/
1585
1586PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 PyObject *s, /* String to split */
1588 PyObject *sep, /* String separator */
1589 Py_ssize_t maxsplit /* Maxsplit count */
1590 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001591
Guido van Rossumd8225182000-03-10 22:33:05 +00001592/* Translate a string by applying a character mapping table to it and
1593 return the resulting Unicode object.
1594
1595 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001596 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001597
1598 Mapping tables may be dictionaries or sequences. Unmapped character
1599 ordinals (ones which cause a LookupError) are left untouched and
1600 are copied as-is.
1601
1602*/
1603
Mark Hammond91a681d2002-08-12 07:21:58 +00001604PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001605 PyObject *str, /* String */
1606 PyObject *table, /* Translate table */
1607 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001608 );
1609
1610/* Join a sequence of strings using the given separator and return
1611 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612
Mark Hammond91a681d2002-08-12 07:21:58 +00001613PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001614 PyObject *separator, /* Separator string */
1615 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001616 );
1617
1618/* Return 1 if substr matches str[start:end] at the given tail end, 0
1619 otherwise. */
1620
Martin v. Löwis18e16552006-02-15 17:27:45 +00001621PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 PyObject *str, /* String */
1623 PyObject *substr, /* Prefix or Suffix string */
1624 Py_ssize_t start, /* Start index */
1625 Py_ssize_t end, /* Stop index */
1626 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001627 );
1628
1629/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001630 given search direction or -1 if not found. -2 is returned in case
1631 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001632
Martin v. Löwis18e16552006-02-15 17:27:45 +00001633PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001634 PyObject *str, /* String */
1635 PyObject *substr, /* Substring to find */
1636 Py_ssize_t start, /* Start index */
1637 Py_ssize_t end, /* Stop index */
1638 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001639 );
1640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641/* Like PyUnicode_Find, but search for single character only. */
1642PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1643 PyObject *str,
1644 Py_UCS4 ch,
1645 Py_ssize_t start,
1646 Py_ssize_t end,
1647 int direction
1648 );
1649
Barry Warsaw51ac5802000-03-20 16:36:48 +00001650/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001651
Martin v. Löwis18e16552006-02-15 17:27:45 +00001652PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 PyObject *str, /* String */
1654 PyObject *substr, /* Substring to count */
1655 Py_ssize_t start, /* Start index */
1656 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001657 );
1658
Barry Warsaw51ac5802000-03-20 16:36:48 +00001659/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001660 and return the resulting Unicode object. */
1661
Mark Hammond91a681d2002-08-12 07:21:58 +00001662PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001663 PyObject *str, /* String */
1664 PyObject *substr, /* Substring to find */
1665 PyObject *replstr, /* Substring to replace */
1666 Py_ssize_t maxcount /* Max. number of replacements to apply;
1667 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001668 );
1669
1670/* Compare two strings and return -1, 0, 1 for less than, equal,
1671 greater than resp. */
1672
Mark Hammond91a681d2002-08-12 07:21:58 +00001673PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 PyObject *left, /* Left string */
1675 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001676 );
1677
Martin v. Löwis5b222132007-06-10 09:51:05 +00001678PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1679 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001680 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001681 );
1682
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001683/* Rich compare two strings and return one of the following:
1684
1685 - NULL in case an exception was raised
1686 - Py_True or Py_False for successful comparisons
1687 - Py_NotImplemented in case the type combination is unknown
1688
1689 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1690 case the conversion of the arguments to Unicode fails with a
1691 UnicodeDecodeError.
1692
1693 Possible values for op:
1694
1695 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1696
1697*/
1698
1699PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 PyObject *left, /* Left string */
1701 PyObject *right, /* Right string */
1702 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001703 );
1704
Thomas Wouters7e474022000-07-16 12:04:32 +00001705/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001706 the resulting Unicode string. */
1707
Mark Hammond91a681d2002-08-12 07:21:58 +00001708PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001709 PyObject *format, /* Format string */
1710 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001711 );
1712
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001713/* Checks whether element is contained in container and return 1/0
1714 accordingly.
1715
1716 element has to coerce to a one-element Unicode string. -1 is
1717 returned in case of an error. */
1718
Mark Hammond91a681d2002-08-12 07:21:58 +00001719PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 PyObject *container, /* Container string */
1721 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001722 );
1723
Martin v. Löwis47383402007-08-15 07:32:56 +00001724/* Checks whether argument is a valid identifier. */
1725
1726PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1727
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001728#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001729/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001730PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001731 PyUnicodeObject *self,
1732 int striptype,
1733 PyObject *sepobj
1734 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001735#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001736
Eric Smith5807c412008-05-11 21:00:57 +00001737/* Using the current locale, insert the thousands grouping
1738 into the string pointed to by buffer. For the argument descriptions,
1739 see Objects/stringlib/localeutil.h */
1740
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001741#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001742PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1743 Py_ssize_t n_buffer,
1744 Py_UNICODE *digits,
1745 Py_ssize_t n_digits,
1746 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001747#endif
Eric Smith5807c412008-05-11 21:00:57 +00001748
Eric Smitha3b1ac82009-04-03 14:45:06 +00001749/* Using explicit passed-in values, insert the thousands grouping
1750 into the string pointed to by buffer. For the argument descriptions,
1751 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001752#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1754 int kind,
1755 void *buffer,
1756 Py_ssize_t n_buffer,
1757 void *digits,
1758 Py_ssize_t n_digits,
1759 Py_ssize_t min_width,
1760 const char *grouping,
1761 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001762#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001763/* === Characters Type APIs =============================================== */
1764
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001765/* Helper array used by Py_UNICODE_ISSPACE(). */
1766
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001767#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001768PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1769
Guido van Rossumd8225182000-03-10 22:33:05 +00001770/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001771 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001772
1773 These APIs are implemented in Objects/unicodectype.c.
1774
1775*/
1776
Mark Hammond91a681d2002-08-12 07:21:58 +00001777PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001778 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001779 );
1780
Mark Hammond91a681d2002-08-12 07:21:58 +00001781PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001782 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001783 );
1784
Mark Hammond91a681d2002-08-12 07:21:58 +00001785PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001786 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001787 );
1788
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001789PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001790 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001791 );
1792
1793PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001794 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001795 );
1796
Mark Hammond91a681d2002-08-12 07:21:58 +00001797PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001798 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001799 );
1800
Mark Hammond91a681d2002-08-12 07:21:58 +00001801PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001802 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001803 );
1804
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001805PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1806 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001807 );
1808
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001809PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1810 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001811 );
1812
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001813PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1814 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001815 );
1816
Mark Hammond91a681d2002-08-12 07:21:58 +00001817PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001818 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001819 );
1820
Mark Hammond91a681d2002-08-12 07:21:58 +00001821PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001822 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001823 );
1824
Mark Hammond91a681d2002-08-12 07:21:58 +00001825PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001826 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001827 );
1828
Mark Hammond91a681d2002-08-12 07:21:58 +00001829PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001830 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001831 );
1832
Mark Hammond91a681d2002-08-12 07:21:58 +00001833PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001834 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001835 );
1836
Mark Hammond91a681d2002-08-12 07:21:58 +00001837PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001838 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001839 );
1840
Georg Brandl559e5d72008-06-11 18:37:52 +00001841PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001842 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001843 );
1844
Mark Hammond91a681d2002-08-12 07:21:58 +00001845PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001846 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001847 );
1848
Victor Stinneref8d95c2010-08-16 22:03:11 +00001849PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1850 const Py_UNICODE *u
1851 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001852
1853PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001854 Py_UNICODE *s1,
1855 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001856
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001857PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1858 Py_UNICODE *s1, const Py_UNICODE *s2);
1859
Martin v. Löwis5b222132007-06-10 09:51:05 +00001860PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001861 Py_UNICODE *s1,
1862 const Py_UNICODE *s2,
1863 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001864
1865PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001866 const Py_UNICODE *s1,
1867 const Py_UNICODE *s2
1868 );
1869
1870PyAPI_FUNC(int) Py_UNICODE_strncmp(
1871 const Py_UNICODE *s1,
1872 const Py_UNICODE *s2,
1873 size_t n
1874 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001875
1876PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001877 const Py_UNICODE *s,
1878 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001879 );
1880
Victor Stinner331ea922010-08-10 16:37:20 +00001881PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001882 const Py_UNICODE *s,
1883 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001884 );
1885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886PyAPI_FUNC(size_t) Py_UCS4_strlen(
1887 const Py_UCS4 *u
1888 );
1889
1890PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1891 Py_UCS4 *s1,
1892 const Py_UCS4 *s2);
1893
1894PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1895 Py_UCS4 *s1, const Py_UCS4 *s2);
1896
1897PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1898 Py_UCS4 *s1,
1899 const Py_UCS4 *s2,
1900 size_t n);
1901
1902PyAPI_FUNC(int) Py_UCS4_strcmp(
1903 const Py_UCS4 *s1,
1904 const Py_UCS4 *s2
1905 );
1906
1907PyAPI_FUNC(int) Py_UCS4_strncmp(
1908 const Py_UCS4 *s1,
1909 const Py_UCS4 *s2,
1910 size_t n
1911 );
1912
1913PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1914 const Py_UCS4 *s,
1915 Py_UCS4 c
1916 );
1917
1918PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1919 const Py_UCS4 *s,
1920 Py_UCS4 c
1921 );
1922
Victor Stinner71133ff2010-09-01 23:43:53 +00001923/* Create a copy of a unicode string ending with a nul character. Return NULL
1924 and raise a MemoryError exception on memory allocation failure, otherwise
1925 return a new allocated buffer (use PyMem_Free() to free the buffer). */
1926
Victor Stinner46408602010-09-03 16:18:00 +00001927PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001928 PyObject *unicode
1929 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001930#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001931
Guido van Rossumd8225182000-03-10 22:33:05 +00001932#ifdef __cplusplus
1933}
1934#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001935#endif /* !Py_UNICODEOBJECT_H */