blob: ba73e562acf67e941aa38927a0c870e7d7dff87c [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
   unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Each argument is evaluated exactly once, in the order target, value,
   length, so expressions with side effects are safe to pass.  A length
   that is zero or negative writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.

   The IS_* predicates test whether the code unit `ch` lies in the
   surrogate range (0xD800-0xDFFF), its high half (0xD800-0xDBFF) or its
   low half (0xDC00-0xDFFF).  `ch` is parenthesized so that arbitrary
   expressions (e.g. `c & 0xFFFF`) are accepted; note that it is still
   evaluated twice, so avoid arguments with side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; the caller is expected
   to have checked that `high` and `low` really are surrogates. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner910337b2011-10-03 03:20:16 +0200209 /* Unicode strings can be in 4 states:
210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
218 * utf8 = data
219
220 - compact:
221
222 * structure = PyCompactUnicodeObject
223 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
224 PyUnicode_4BYTE_KIND
225 * compact = 1
226 * ready = 1
227 * (ascii = 0)
228
229 - string created by the legacy API (not ready):
230
231 * structure = PyUnicodeObject
232 * kind = PyUnicode_WCHAR_KIND
233 * compact = 0
234 * ready = 0
235 * wstr is not NULL
236 * data.any is NULL
237 * utf8 is NULL
238 * interned = SSTATE_NOT_INTERNED
239 * (ascii = 0)
240
241 - string created by the legacy API, ready:
242
243 * structure = PyUnicodeObject structure
244 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
245 PyUnicode_4BYTE_KIND
246 * compact = 0
247 * ready = 1
248 * data.any is not NULL
249 * (ascii = 0)
250
251 String created by the legacy API becomes ready when calling
252 PyUnicode_READY().
253
254 See also _PyUnicode_CheckConsistency(). */
Guido van Rossumd8225182000-03-10 22:33:05 +0000255 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200256 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000257 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200258 struct {
259 /*
260 SSTATE_NOT_INTERNED (0)
261 SSTATE_INTERNED_MORTAL (1)
262 SSTATE_INTERNED_IMMORTAL (2)
263
264 If interned != SSTATE_NOT_INTERNED, the two references from the
265 dictionary to this object are *not* counted in ob_refcnt.
266 */
267 unsigned int interned:2;
268 /* Character size:
269
270 PyUnicode_WCHAR_KIND (0): wchar_t*
271 PyUnicode_1BYTE_KIND (1): Py_UCS1*
272 PyUnicode_2BYTE_KIND (2): Py_UCS2*
273 PyUnicode_4BYTE_KIND (3): Py_UCS4*
274 */
275 unsigned int kind:2;
276 /* Compact is with respect to the allocation scheme. Compact unicode
277 objects only require one memory block while non-compact objects use
278 one block for the PyUnicodeObject struct and another for its data
279 buffer. */
280 unsigned int compact:1;
281 /* Compact objects which are ASCII-only also have the state.compact
282 flag set, and use the PyASCIIObject struct. */
283 unsigned int ascii:1;
284 /* The ready flag indicates whether the object layout is initialized
285 completely. This means that this is either a compact object, or
286 the data pointer is filled out. The bit is redundant, and helps
287 to minimize the test in PyUnicode_IS_READY(). */
288 unsigned int ready:1;
289 } state;
290 wchar_t *wstr; /* wchar_t representation (null-terminated) */
291} PyASCIIObject;
292
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure. */
296typedef struct {
297 PyASCIIObject _base;
298 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
299 * terminating \0. */
300 char *utf8; /* UTF-8 representation (null-terminated) */
301 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
302 * surrogates count as two code points. */
303} PyCompactUnicodeObject;
304
305/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
306 PyUnicodeObject structure. The actual string data is initially in the wstr
307 block, and copied into the data block using PyUnicode_Ready. */
308typedef struct {
309 PyCompactUnicodeObject _base;
310 union {
311 void *any;
312 Py_UCS1 *latin1;
313 Py_UCS2 *ucs2;
314 Py_UCS4 *ucs4;
315 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000316} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000317#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000318
Mark Hammond91a681d2002-08-12 07:21:58 +0000319PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000320PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000321
Thomas Wouters27d517b2007-02-25 20:39:11 +0000322#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000323 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
324#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000325
326/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000327#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200328
/* Length of the wstr representation.  For objects with state.ascii set the
   value is taken from the PyASCIIObject length field; otherwise it is read
   from PyCompactUnicodeObject.wstr_length.  No type check is performed.
   `op` is parenthesized inside each cast so pointer expressions such as
   `p + 1` are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (((PyASCIIObject*)(op))->state.ascii ?       \
     ((PyASCIIObject*)(op))->length :            \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
333
334/* Returns the deprecated Py_UNICODE representation's size in code units
335 (this includes surrogate pairs as 2 units).
336 If the Py_UNICODE representation is not available, it will be computed
337 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
338
Guido van Rossumd8225182000-03-10 22:33:05 +0000339#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200340 (assert(PyUnicode_Check(op)), \
341 (((PyASCIIObject *)(op))->wstr) ? \
342 PyUnicode_WSTR_LENGTH(op) : \
343 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
344 PyUnicode_WSTR_LENGTH(op)))
345
Guido van Rossumd8225182000-03-10 22:33:05 +0000346#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200347 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
348
349/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
350 representation on demand. Using this macro is very inefficient now,
351 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
352 use PyUnicode_WRITE() and PyUnicode_READ(). */
353
Guido van Rossumd8225182000-03-10 22:33:05 +0000354#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 (assert(PyUnicode_Check(op)), \
356 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
357 PyUnicode_AsUnicode((PyObject *)(op)))
358
Guido van Rossumd8225182000-03-10 22:33:05 +0000359#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200360 ((const char *)(PyUnicode_AS_UNICODE(op)))
361
362
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
364
365/* Values for PyUnicodeObject.state: */
366
367/* Interning state. */
368#define SSTATE_NOT_INTERNED 0
369#define SSTATE_INTERNED_MORTAL 1
370#define SSTATE_INTERNED_IMMORTAL 2
371
/* Nonzero when the state.ascii flag of `op` is set.  ASCII-only objects
   also have state.compact set (see the PyASCIIObject state notes), so only
   the ascii bit is tested.  No type check is performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)(op))->state.ascii)
373
374/* String contains only wstr byte characters. This is only possible
375 when the string was created with a legacy API and PyUnicode_Ready()
376 has not been called yet. */
377#define PyUnicode_WCHAR_KIND 0
378
379/* Return values of the PyUnicode_KIND() macro: */
380
381#define PyUnicode_1BYTE_KIND 1
382#define PyUnicode_2BYTE_KIND 2
383#define PyUnicode_4BYTE_KIND 3
384
385
386/* Return the number of bytes the string uses to represent single characters,
Victor Stinner4584a5b2011-10-01 02:39:37 +0200387 this can be 1, 2 or 4.
388
389 See also PyUnicode_KIND_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200390#define PyUnicode_CHARACTER_SIZE(op) \
391 (1 << (PyUnicode_KIND(op) - 1))
392
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed; use PyUnicode_CHARACTER_SIZE() or
   PyUnicode_KIND() beforehand to ensure these will work correctly. */
397
398#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
399#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
400#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
401
402/* Return true if the string is compact or 0 if not.
403 No type checks or Ready calls are performed. */
404#define PyUnicode_IS_COMPACT(op) \
405 (((PyASCIIObject*)(op))->state.compact)
406
Victor Stinner157f83f2011-09-28 21:41:31 +0200407/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408#define PyUnicode_KIND(op) \
409 (assert(PyUnicode_Check(op)), \
410 assert(PyUnicode_IS_READY(op)), \
411 ((PyASCIIObject *)(op))->state.kind)
412
Victor Stinner157f83f2011-09-28 21:41:31 +0200413/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#define _PyUnicode_COMPACT_DATA(op) \
415 (PyUnicode_IS_COMPACT_ASCII(op) ? \
416 ((void*)((PyASCIIObject*)(op) + 1)) : \
417 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
418
419#define _PyUnicode_NONCOMPACT_DATA(op) \
420 (assert(((PyUnicodeObject*)(op))->data.any), \
421 ((((PyUnicodeObject *)(op))->data.any)))
422
423#define PyUnicode_DATA(op) \
424 (assert(PyUnicode_Check(op)), \
425 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
426 _PyUnicode_NONCOMPACT_DATA(op))
427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200428/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
Victor Stinner4584a5b2011-10-01 02:39:37 +0200429 The index is a character index, the result is a size in bytes.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430
Victor Stinner4584a5b2011-10-01 02:39:37 +0200431 See also PyUnicode_CHARACTER_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
433
434/* In the access macros below, "kind" may be evaluated more than once.
435 All other macro parameters are evaluated exactly once, so it is safe
436 to put side effects into them (such as increasing the index). */
437
/* Write into the canonical representation. This macro does not do any sanity
   checks and is intended for usage in loops. The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location. */
443#define PyUnicode_WRITE(kind, data, index, value) \
444 do { \
445 switch ((kind)) { \
446 case PyUnicode_1BYTE_KIND: { \
447 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
448 break; \
449 } \
450 case PyUnicode_2BYTE_KIND: { \
451 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
452 break; \
453 } \
454 default: { \
455 assert((kind) == PyUnicode_4BYTE_KIND); \
456 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
457 } \
458 } \
459 } while (0)
460
/* Read a code point from the string's canonical representation. No checks
   or ready calls are performed. */
463#define PyUnicode_READ(kind, data, index) \
464 ((Py_UCS4) \
465 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200466 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200467 ((kind) == PyUnicode_2BYTE_KIND ? \
468 ((const Py_UCS2 *)(data))[(index)] : \
469 ((const Py_UCS4 *)(data))[(index)] \
470 ) \
471 ))
472
473/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
474 calls PyUnicode_KIND() and might call it twice. For single reads, use
475 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
476 cache kind and use PyUnicode_READ instead. */
477#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200478 (assert(PyUnicode_Check(unicode)), \
479 assert(PyUnicode_IS_READY(unicode)), \
480 (Py_UCS4) \
481 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
482 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
483 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
484 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
485 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
486 ) \
487 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200488
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
492#define PyUnicode_GET_LENGTH(op) \
493 (assert(PyUnicode_Check(op)), \
494 assert(PyUnicode_IS_READY(op)), \
495 ((PyASCIIObject *)(op))->length)
496
497
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Reads the state.ready bit only; no type check is performed.  `op` is
   parenthesized inside the cast so pointer expressions are cast as a
   whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
502
503/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
504 case. If the canonical representation is not yet set, it will still call
505 PyUnicode_Ready().
506 Returns 0 on success and -1 on errors. */
507#define PyUnicode_READY(op) \
508 (assert(PyUnicode_Check(op)), \
509 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200510 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511
/* Return a maximum character value which is suitable for creating another
   string based on op. This is always an approximation but more efficient
   than iterating over the string. */
515#define PyUnicode_MAX_CHAR_VALUE(op) \
516 (assert(PyUnicode_IS_READY(op)), \
517 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
518 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
519 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
520 (0x7fU) : (0xffU) \
521 ) : \
522 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
523 (0xffffU) : (0x10ffffU) \
524 ))))
525
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000526#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000527
528/* --- Constants ---------------------------------------------------------- */
529
530/* This Unicode character will be used as replacement character during
531 decoding if the errors argument is set to "replace". Note: the
532 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
533 Unicode 3.0. */
534
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200535#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000536
537/* === Public API ========================================================= */
538
539/* --- Plain Py_UNICODE --------------------------------------------------- */
540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541/* With PEP 393, this is the recommended way to allocate a new unicode object.
542 This function will allocate the object and its buffer in a single memory
543 block. Objects created using this function are not resizable. */
544#ifndef Py_LIMITED_API
545PyAPI_FUNC(PyObject*) PyUnicode_New(
546 Py_ssize_t size, /* Number of code points in the new string */
547 Py_UCS4 maxchar /* maximum code point value in the string */
548 );
549#endif
550
/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   Don't call this function directly, use the public PyUnicode_READY() macro
   instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558#ifndef Py_LIMITED_API
559PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200560 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 );
562#endif
563
Victor Stinner034f6cf2011-09-30 02:26:44 +0200564/* Get a copy of a Unicode string. */
565PyAPI_FUNC(PyObject*) PyUnicode_Copy(
566 PyObject *unicode
567 );
568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200570 character conversion when necessary and falls back to memcpy if possible.
571
Victor Stinnera0702ab2011-09-29 14:14:38 +0200572 Fail if to is too small (smaller than how_many or smaller than
573 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
574 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200575
576 Return the number of written character, or return -1 and raise an exception
577 on error.
578
579 Pseudo-code:
580
581 how_many = min(how_many, len(from) - from_start)
582 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
583 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200584
585 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200586 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200587#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200588PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200589 PyObject *to,
590 Py_ssize_t to_start,
591 PyObject *from,
592 Py_ssize_t from_start,
593 Py_ssize_t how_many
594 );
595#endif
596
Guido van Rossumd8225182000-03-10 22:33:05 +0000597/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000599
600 u may be NULL which causes the contents to be undefined. It is the
601 user's responsibility to fill in the needed data afterwards. Note
602 that modifying the Unicode object contents after construction is
603 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000604
605 The buffer is copied into the new object. */
606
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000607#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000608PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000609 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000610 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000611 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000612#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000613
Georg Brandl952867a2010-06-27 10:17:12 +0000614/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000615PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000616 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000617 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000618 );
619
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000620/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000622PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000623 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000624 );
625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200626#ifndef Py_LIMITED_API
627PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
628 int kind,
629 const void *buffer,
630 Py_ssize_t size);
631#endif
632
633PyAPI_FUNC(PyObject*) PyUnicode_Substring(
634 PyObject *str,
635 Py_ssize_t start,
636 Py_ssize_t end);
637
/* Copy the string into a UCS4 buffer, including the null character if
   copy_null is set. Return NULL and raise an exception on error. Raise a
   ValueError if the buffer is smaller than the string. Return buffer on
   success.

   buflen is the length of the buffer in (Py_UCS4) characters. */
643PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
644 PyObject *unicode,
645 Py_UCS4* buffer,
646 Py_ssize_t buflen,
647 int copy_null);
648
649/* Copy the string into a UCS4 buffer. A new buffer is allocated using
650 * PyMem_Malloc; if this fails, NULL is returned with a memory error
651 exception set. */
652PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
653
Guido van Rossumd8225182000-03-10 22:33:05 +0000654/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 Py_UNICODE buffer.
656 If the wchar_t/Py_UNICODE representation is not yet available, this
657 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000658
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000659#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000660PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000662 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000663#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665/* Return a read-only pointer to the Unicode object's internal
666 Py_UNICODE buffer and save the length at size.
667 If the wchar_t/Py_UNICODE representation is not yet available, this
668 function will calculate it. */
669
670#ifndef Py_LIMITED_API
671PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
672 PyObject *unicode, /* Unicode object */
673 Py_ssize_t *size /* location where to save the length */
674 );
675#endif
676
Guido van Rossumd8225182000-03-10 22:33:05 +0000677/* Get the length of the Unicode object. */
678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
680 PyObject *unicode
681);
682
Victor Stinner157f83f2011-09-28 21:41:31 +0200683/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 string representation. */
685
Martin v. Löwis18e16552006-02-15 17:27:45 +0000686PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000688 );
689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690/* Read a character from the string. */
691
692PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
693 PyObject *unicode,
694 Py_ssize_t index
695 );
696
697/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200698 PyUnicode_New, must not be shared, and must not have been hashed yet.
699
700 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701
702PyAPI_FUNC(int) PyUnicode_WriteChar(
703 PyObject *unicode,
704 Py_ssize_t index,
705 Py_UCS4 character
706 );
707
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000708#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000709/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000710PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000711#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000712
Guido van Rossum52c23592000-04-10 13:41:41 +0000713/* Resize an already allocated Unicode object to the new size length.
714
715 *unicode is modified to point to the new (resized) object and 0
716 returned on success.
717
718 This API may only be called by the function which also called the
719 Unicode constructor. The refcount on the object must be 1. Otherwise,
720 an error is returned.
721
722 Error handling is implemented as follows: an exception is set, -1
723 is returned and *unicode left untouched.
724
725*/
726
Mark Hammond91a681d2002-08-12 07:21:58 +0000727PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 PyObject **unicode, /* Pointer to the Unicode object */
729 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000730 );
731
Guido van Rossumd8225182000-03-10 22:33:05 +0000732/* Coerce obj to a Unicode object and return a reference with
733 *incremented* refcount.
734
735 Coercion is done in the following way:
736
Georg Brandl952867a2010-06-27 10:17:12 +0000737 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000738 under the assumption that they contain data using the UTF-8
739 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000740
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000741 2. All other objects (including Unicode objects) raise an
742 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000743
744 The API returns NULL in case of an error. The caller is responsible
745 for decref'ing the returned objects.
746
747*/
748
Mark Hammond91a681d2002-08-12 07:21:58 +0000749PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000751 const char *encoding, /* encoding */
752 const char *errors /* error handling */
753 );
754
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000755/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000756 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000758 Unicode objects are passed back as-is (subclasses are converted to
759 true Unicode objects), all other objects are delegated to
760 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000761 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000762
763 The API returns NULL in case of an error. The caller is responsible
764 for decref'ing the returned objects.
765
766*/
767
Mark Hammond91a681d2002-08-12 07:21:58 +0000768PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000770 );
771
Victor Stinner1205f272010-09-11 00:54:47 +0000772PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
773 const char *format, /* ASCII-encoded string */
774 va_list vargs
775 );
776PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
777 const char *format, /* ASCII-encoded string */
778 ...
779 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000780
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000781#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000782/* Format the object based on the format_spec, as defined in PEP 3101
783 (Advanced String Formatting). */
784PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200785 PyObject *format_spec,
786 Py_ssize_t start,
787 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000788#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000789
Walter Dörwald16807132007-05-25 13:52:07 +0000790PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
791PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000792PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
793 const char *u /* UTF-8 encoded string */
794 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000795#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000796PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000797#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000798
799/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800#define PyUnicode_CHECK_INTERNED(op) \
801 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000802
Guido van Rossumd8225182000-03-10 22:33:05 +0000803/* --- wchar_t support for platforms which support it --------------------- */
804
805#ifdef HAVE_WCHAR_H
806
Georg Brandl952867a2010-06-27 10:17:12 +0000807/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000808 size.
809
810 The buffer is copied into the new object. */
811
Mark Hammond91a681d2002-08-12 07:21:58 +0000812PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000813 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000815 );
816
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000817/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000818 most size wchar_t characters are copied.
819
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000820 Note that the resulting wchar_t string may or may not be
821 0-terminated. It is the responsibility of the caller to make sure
822 that the wchar_t string is 0-terminated in case this is required by
823 the application.
824
825 Returns the number of wchar_t characters copied (excluding a
826 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000827 error. */
828
Martin v. Löwis18e16552006-02-15 17:27:45 +0000829PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000830 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000831 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000832 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000833 );
834
Victor Stinner137c34c2010-09-29 10:25:54 +0000835/* Convert the Unicode object to a wide character string. The output string
836 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200837 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000838
839 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
840 on success. On error, returns NULL, *size is undefined and raises a
841 MemoryError. */
842
843PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000844 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000845 Py_ssize_t *size /* number of characters of the result */
846 );
847
Victor Stinner9f789e72011-10-01 03:57:28 +0200848#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200850#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851
Guido van Rossumd8225182000-03-10 22:33:05 +0000852#endif
853
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000854/* --- Unicode ordinals --------------------------------------------------- */
855
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000856/* Create a Unicode Object from the given Unicode code point ordinal.
857
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000858 The ordinal must be in range(0x10000) on narrow Python builds
859 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
860 raised in case it is not.
861
862*/
863
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000864PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000865
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000866/* --- Free-list management ----------------------------------------------- */
867
868/* Clear the free list used by the Unicode implementation.
869
870 This can be used to release memory used for objects on the free
871 list back to the Python memory allocator.
872
873*/
874
875PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
876
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000877/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000878
879 Many of these APIs take two arguments encoding and errors. These
880 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000881 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000882
Georg Brandl952867a2010-06-27 10:17:12 +0000883 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000884
885 Error handling is set by errors which may also be set to NULL
886 meaning to use the default handling defined for the codec. Default
887 error handling for all builtin codecs is "strict" (ValueErrors are
888 raised).
889
890 The codecs all use a similar interface. Only deviations from the
891 generic ones are documented.
892
893*/
894
Fred Drakecb093fe2000-05-09 19:51:53 +0000895/* --- Manage the default encoding ---------------------------------------- */
896
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000897/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000898 Unicode object unicode and the size of the encoded representation
899 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000900
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000901 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000902
Victor Stinner157f83f2011-09-28 21:41:31 +0200903 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 and subsequent calls will return the same string. The memory is released
905 when the unicodeobject is deallocated.
906
907 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
908 support the previous internal function with the same behaviour.
909
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000910 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000911 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000912
913 *** If you need to access the Unicode object as UTF-8 bytes string,
914 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000915*/
916
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000917#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000919 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000920 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000922#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000923
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000924/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000925 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
928 in the unicodeobject.
929
930 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
931 support the previous internal function with the same behaviour.
932
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000933 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000934 extracted from the returned data.
935
936 *** This API is for interpreter INTERNAL USE ONLY and will likely
937 *** be removed or changed in the future.
938
939 *** If you need to access the Unicode object as UTF-8 bytes string,
940 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000941
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000942*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000943
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000944#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
946#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000947#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000948
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000949/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000950
Mark Hammond91a681d2002-08-12 07:21:58 +0000951PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000952
Guido van Rossumd8225182000-03-10 22:33:05 +0000953/* --- Generic Codecs ----------------------------------------------------- */
954
955/* Create a Unicode object by decoding the encoded string s of the
956 given size. */
957
Mark Hammond91a681d2002-08-12 07:21:58 +0000958PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000959 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000961 const char *encoding, /* encoding */
962 const char *errors /* error handling */
963 );
964
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000965/* Decode a Unicode object unicode and return the result as Python
966 object. */
967
968PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000969 PyObject *unicode, /* Unicode object */
970 const char *encoding, /* encoding */
971 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000972 );
973
974/* Decode a Unicode object unicode and return the result as Unicode
975 object. */
976
977PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000978 PyObject *unicode, /* Unicode object */
979 const char *encoding, /* encoding */
980 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000981 );
982
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000983/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000984 Python string object. */
985
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000986#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000987PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000988 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000989 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000990 const char *encoding, /* encoding */
991 const char *errors /* error handling */
992 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000993#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000994
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000995/* Encodes a Unicode object and returns the result as Python
996 object. */
997
998PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000999 PyObject *unicode, /* Unicode object */
1000 const char *encoding, /* encoding */
1001 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001002 );
1003
Guido van Rossumd8225182000-03-10 22:33:05 +00001004/* Encodes a Unicode object and returns the result as Python string
1005 object. */
1006
Mark Hammond91a681d2002-08-12 07:21:58 +00001007PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001008 PyObject *unicode, /* Unicode object */
1009 const char *encoding, /* encoding */
1010 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001011 );
1012
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001013/* Encodes a Unicode object and returns the result as Unicode
1014 object. */
1015
1016PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017 PyObject *unicode, /* Unicode object */
1018 const char *encoding, /* encoding */
1019 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001020 );
1021
1022/* Build an encoding map. */
1023
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001024PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1025 PyObject* string /* 256 character map */
1026 );
1027
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001028/* --- UTF-7 Codecs ------------------------------------------------------- */
1029
Mark Hammond91a681d2002-08-12 07:21:58 +00001030PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001031 const char *string, /* UTF-7 encoded string */
1032 Py_ssize_t length, /* size of string */
1033 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001034 );
1035
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001036PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037 const char *string, /* UTF-7 encoded string */
1038 Py_ssize_t length, /* size of string */
1039 const char *errors, /* error handling */
1040 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001041 );
1042
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001043#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001044PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001045 const Py_UNICODE *data, /* Unicode char buffer */
1046 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1047 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1048 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1049 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001050 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001051#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052
Guido van Rossumd8225182000-03-10 22:33:05 +00001053/* --- UTF-8 Codecs ------------------------------------------------------- */
1054
Mark Hammond91a681d2002-08-12 07:21:58 +00001055PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 const char *string, /* UTF-8 encoded string */
1057 Py_ssize_t length, /* size of string */
1058 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001059 );
1060
Walter Dörwald69652032004-09-07 20:24:22 +00001061PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 const char *string, /* UTF-8 encoded string */
1063 Py_ssize_t length, /* size of string */
1064 const char *errors, /* error handling */
1065 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001066 );
1067
Mark Hammond91a681d2002-08-12 07:21:58 +00001068PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001070 );
1071
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001072#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1074 PyObject *unicode,
1075 const char *errors);
1076
Mark Hammond91a681d2002-08-12 07:21:58 +00001077PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 const Py_UNICODE *data, /* Unicode char buffer */
1079 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1080 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001081 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001082#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001083
Walter Dörwald41980ca2007-08-16 21:55:45 +00001084/* --- UTF-32 Codecs ------------------------------------------------------ */
1085
1086/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1087 the corresponding Unicode object.
1088
1089 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001091
1092 If byteorder is non-NULL, the decoder starts decoding using the
1093 given byte order:
1094
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 *byteorder == -1: little endian
1096 *byteorder == 0: native order
1097 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001098
1099 In native mode, the first four bytes of the stream are checked for a
1100 BOM mark. If found, the BOM mark is analysed, the byte order
1101 adjusted and the BOM skipped. In the other modes, no BOM mark
1102 interpretation is done. After completion, *byteorder is set to the
1103 current byte order at the end of input data.
1104
1105 If byteorder is NULL, the codec starts in native order mode.
1106
1107*/
1108
1109PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 const char *string, /* UTF-32 encoded string */
1111 Py_ssize_t length, /* size of string */
1112 const char *errors, /* error handling */
1113 int *byteorder /* pointer to byteorder to use
1114 0=native;-1=LE,1=BE; updated on
1115 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001116 );
1117
1118PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 const char *string, /* UTF-32 encoded string */
1120 Py_ssize_t length, /* size of string */
1121 const char *errors, /* error handling */
1122 int *byteorder, /* pointer to byteorder to use
1123 0=native;-1=LE,1=BE; updated on
1124 exit */
1125 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001126 );
1127
1128/* Returns a Python string using the UTF-32 encoding in native byte
1129 order. The string always starts with a BOM mark. */
1130
1131PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001132 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001133 );
1134
1135/* Returns a Python string object holding the UTF-32 encoded value of
1136 the Unicode data.
1137
1138 If byteorder is not 0, output is written according to the following
1139 byte order:
1140
1141 byteorder == -1: little endian
1142 byteorder == 0: native byte order (writes a BOM mark)
1143 byteorder == 1: big endian
1144
1145 If byteorder is 0, the output string will always start with the
1146 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1147 prepended.
1148
1149*/
1150
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001151#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001152PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 const Py_UNICODE *data, /* Unicode char buffer */
1154 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1155 const char *errors, /* error handling */
1156 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001157 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001158#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001159
Guido van Rossumd8225182000-03-10 22:33:05 +00001160/* --- UTF-16 Codecs ------------------------------------------------------ */
1161
Guido van Rossum9e896b32000-04-05 20:11:21 +00001162/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001163 the corresponding Unicode object.
1164
1165 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001167
1168 If byteorder is non-NULL, the decoder starts decoding using the
1169 given byte order:
1170
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171 *byteorder == -1: little endian
1172 *byteorder == 0: native order
1173 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001174
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001175 In native mode, the first two bytes of the stream are checked for a
1176 BOM mark. If found, the BOM mark is analysed, the byte order
1177 adjusted and the BOM skipped. In the other modes, no BOM mark
1178 interpretation is done. After completion, *byteorder is set to the
1179 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001180
1181 If byteorder is NULL, the codec starts in native order mode.
1182
1183*/
1184
Mark Hammond91a681d2002-08-12 07:21:58 +00001185PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 const char *string, /* UTF-16 encoded string */
1187 Py_ssize_t length, /* size of string */
1188 const char *errors, /* error handling */
1189 int *byteorder /* pointer to byteorder to use
1190 0=native;-1=LE,1=BE; updated on
1191 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001192 );
1193
Walter Dörwald69652032004-09-07 20:24:22 +00001194PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195 const char *string, /* UTF-16 encoded string */
1196 Py_ssize_t length, /* size of string */
1197 const char *errors, /* error handling */
1198 int *byteorder, /* pointer to byteorder to use
1199 0=native;-1=LE,1=BE; updated on
1200 exit */
1201 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001202 );
1203
Guido van Rossumd8225182000-03-10 22:33:05 +00001204/* Returns a Python string using the UTF-16 encoding in native byte
1205 order. The string always starts with a BOM mark. */
1206
Mark Hammond91a681d2002-08-12 07:21:58 +00001207PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001209 );
1210
1211/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001212 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001213
1214 If byteorder is not 0, output is written according to the following
1215 byte order:
1216
1217 byteorder == -1: little endian
1218 byteorder == 0: native byte order (writes a BOM mark)
1219 byteorder == 1: big endian
1220
1221 If byteorder is 0, the output string will always start with the
1222 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1223 prepended.
1224
1225 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1226 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001227 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001228
1229*/
1230
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001231#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001232PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001233 const Py_UNICODE *data, /* Unicode char buffer */
1234 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1235 const char *errors, /* error handling */
1236 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001237 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001238#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001239
1240/* --- Unicode-Escape Codecs ---------------------------------------------- */
1241
Mark Hammond91a681d2002-08-12 07:21:58 +00001242PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 const char *string, /* Unicode-Escape encoded string */
1244 Py_ssize_t length, /* size of string */
1245 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001246 );
1247
Mark Hammond91a681d2002-08-12 07:21:58 +00001248PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001250 );
1251
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001252#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001253PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 const Py_UNICODE *data, /* Unicode char buffer */
1255 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001256 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001257#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001258
1259/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1260
Mark Hammond91a681d2002-08-12 07:21:58 +00001261PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001262 const char *string, /* Raw-Unicode-Escape encoded string */
1263 Py_ssize_t length, /* size of string */
1264 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001265 );
1266
Mark Hammond91a681d2002-08-12 07:21:58 +00001267PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001269 );
1270
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001271#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001272PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 const Py_UNICODE *data, /* Unicode char buffer */
1274 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001275 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001276#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001277
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001278/* --- Unicode Internal Codec ---------------------------------------------
1279
1280 Only for internal use in _codecsmodule.c */
1281
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001282#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001283PyObject *_PyUnicode_DecodeUnicodeInternal(
1284 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001286 const char *errors
1287 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001288#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001289
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001291
1292 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1293
1294*/
1295
Mark Hammond91a681d2002-08-12 07:21:58 +00001296PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 const char *string, /* Latin-1 encoded string */
1298 Py_ssize_t length, /* size of string */
1299 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001300 );
1301
Mark Hammond91a681d2002-08-12 07:21:58 +00001302PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001304 );
1305
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001306#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1308 PyObject* unicode,
1309 const char* errors);
1310
Mark Hammond91a681d2002-08-12 07:21:58 +00001311PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 const Py_UNICODE *data, /* Unicode char buffer */
1313 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1314 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001315 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001316#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001317
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001319
1320 Only 7-bit ASCII data is accepted. All other codes generate errors.
1321
1322*/
1323
Mark Hammond91a681d2002-08-12 07:21:58 +00001324PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 const char *string, /* ASCII encoded string */
1326 Py_ssize_t length, /* size of string */
1327 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001328 );
1329
Mark Hammond91a681d2002-08-12 07:21:58 +00001330PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001331 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001332 );
1333
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001334#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1336 PyObject* unicode,
1337 const char* errors);
1338
Mark Hammond91a681d2002-08-12 07:21:58 +00001339PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001340 const Py_UNICODE *data, /* Unicode char buffer */
1341 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1342 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001343 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001344#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001345
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001349
1350 Decoding mappings must map single string characters to single
1351 Unicode characters, integers (which are then interpreted as Unicode
1352 ordinals) or None (meaning "undefined mapping" and causing an
1353 error).
1354
1355 Encoding mappings must map single Unicode characters to single
1356 string characters, integers (which are then interpreted as Latin-1
1357 ordinals) or None (meaning "undefined mapping" and causing an
1358 error).
1359
1360 If a character lookup fails with a LookupError, the character is
1361 copied as-is meaning that its ordinal value will be interpreted as
1362 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1363 to contain those mappings which map characters to different code
1364 points.
1365
1366*/
1367
Mark Hammond91a681d2002-08-12 07:21:58 +00001368PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001369 const char *string, /* Encoded string */
1370 Py_ssize_t length, /* size of string */
1371 PyObject *mapping, /* character mapping
1372 (char ordinal -> unicode ordinal) */
1373 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001374 );
1375
Mark Hammond91a681d2002-08-12 07:21:58 +00001376PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377 PyObject *unicode, /* Unicode object */
1378 PyObject *mapping /* character mapping
1379 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001380 );
1381
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001382#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001383PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 const Py_UNICODE *data, /* Unicode char buffer */
1385 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1386 PyObject *mapping, /* character mapping
1387 (unicode ordinal -> char ordinal) */
1388 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001389 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001390#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001391
1392/* Translate a Py_UNICODE buffer of the given length by applying a
1393 character mapping table to it and return the resulting Unicode
1394 object.
1395
1396 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001398
1399 Mapping tables may be dictionaries or sequences. Unmapped character
1400 ordinals (ones which cause a LookupError) are left untouched and
1401 are copied as-is.
1402
1403*/
1404
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001405#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001406PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 const Py_UNICODE *data, /* Unicode char buffer */
1408 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1409 PyObject *table, /* Translate table */
1410 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001411 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001412#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001413
Victor Stinner99b95382011-07-04 14:23:54 +02001414#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001415
Guido van Rossumefec1152000-03-28 02:01:15 +00001416/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001417
Mark Hammond91a681d2002-08-12 07:21:58 +00001418PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001419 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001420 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001421 const char *errors /* error handling */
1422 );
1423
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001424PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1425 const char *string, /* MBCS encoded string */
1426 Py_ssize_t length, /* size of string */
1427 const char *errors, /* error handling */
1428 Py_ssize_t *consumed /* bytes consumed */
1429 );
1430
Mark Hammond91a681d2002-08-12 07:21:58 +00001431PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001432 PyObject *unicode /* Unicode object */
1433 );
1434
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001435#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001436PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001437 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001438 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001439 const char *errors /* error handling */
1440 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001441#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001442
Victor Stinner99b95382011-07-04 14:23:54 +02001443#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001444
Guido van Rossum9e896b32000-04-05 20:11:21 +00001445/* --- Decimal Encoder ---------------------------------------------------- */
1446
1447/* Takes a Unicode string holding a decimal value and writes it into
1448 an output buffer using standard ASCII digit codes.
1449
1450 The output buffer has to provide at least length+1 bytes of storage
1451 area. The output string is 0-terminated.
1452
1453 The encoder converts whitespace to ' ', decimal characters to their
1454 corresponding ASCII digit and all other Latin-1 characters except
1455 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1456 are treated as errors. This includes embedded NUL characters.
1457
1458 Error handling is defined by the errors argument:
1459
1460 NULL or "strict": raise a ValueError
1461 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001462 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001463 "replace": replaces illegal characters with '?'
1464
1465 Returns 0 on success, -1 on failure.
1466
1467*/
1468
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001469#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001470PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 Py_UNICODE *s, /* Unicode buffer */
1472 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1473 char *output, /* Output buffer; must have size >= length+1 (the output is 0-terminated) */
1474 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001475 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001476#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001477
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001478/* Transforms code points that have decimal digit property to the
1479 corresponding ASCII digit code points.
1480
1481 Returns a new Unicode string on success, NULL on failure.
1482*/
1483
Georg Brandlb5503082010-12-05 11:40:48 +00001484#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001485PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1486 Py_UNICODE *s, /* Unicode buffer */
1487 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1488 );
Georg Brandlb5503082010-12-05 11:40:48 +00001489#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1492 as argument instead of a raw buffer and length. This function additionally
1493 transforms spaces to ASCII because this is what the callers in longobject,
1494 floatobject, and complexobject did anyways. */
1495
1496#ifndef Py_LIMITED_API
1497PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1498 PyObject *unicode /* Unicode object */
1499 );
1500#endif
1501
Martin v. Löwis011e8422009-05-05 04:43:17 +00001502/* --- File system encoding ---------------------------------------------- */
1503
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001504/* ParseTuple converter: encode str objects to bytes using
1505 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001506
1507PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1508
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001509/* ParseTuple converter: decode bytes objects to unicode using
1510 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1511
1512PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1513
Victor Stinner77c38622010-05-14 15:58:55 +00001514/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1515 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001516
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001517 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1518 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001519
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001520 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001521*/
1522
1523PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1524 const char *s /* encoded string */
1525 );
1526
Victor Stinner77c38622010-05-14 15:58:55 +00001527/* Decode a string using Py_FileSystemDefaultEncoding
1528 and the "surrogateescape" error handler.
1529
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001530 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1531 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001532*/
1533
Martin v. Löwis011e8422009-05-05 04:43:17 +00001534PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1535 const char *s, /* encoded string */
1536 Py_ssize_t size /* size */
1537 );
1538
Victor Stinnerae6265f2010-05-15 16:27:27 +00001539/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001540 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001541
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001542 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1543 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001544*/
1545
1546PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1547 PyObject *unicode
1548 );
1549
Guido van Rossumd8225182000-03-10 22:33:05 +00001550/* --- Methods & Slots ----------------------------------------------------
1551
1552 These are capable of handling Unicode objects and strings on input
1553 (we refer to them as strings in the descriptions) and return
1554 Unicode objects or integers as appropriate. */
1555
1556/* Concat two strings giving a new Unicode string. */
1557
Mark Hammond91a681d2002-08-12 07:21:58 +00001558PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 PyObject *left, /* Left string */
1560 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001561 );
1562
Walter Dörwald1ab83302007-05-18 17:15:44 +00001563/* Concat two strings and put the result in *pleft
1564 (sets *pleft to NULL on error) */
1565
1566PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 PyObject **pleft, /* Pointer to left string */
1568 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001569 );
1570
1571/* Concat two strings, put the result in *pleft and drop the right object
1572 (sets *pleft to NULL on error) */
1573
1574PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001575 PyObject **pleft, /* Pointer to left string */
1576 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001577 );
1578
Guido van Rossumd8225182000-03-10 22:33:05 +00001579/* Split a string giving a list of Unicode strings.
1580
1581 If sep is NULL, splitting will be done at all whitespace
1582 substrings. Otherwise, splits occur at the given separator.
1583
1584 At most maxsplit splits will be done. If negative, no limit is set.
1585
1586 Separators are not included in the resulting list.
1587
1588*/
1589
Mark Hammond91a681d2002-08-12 07:21:58 +00001590PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 PyObject *s, /* String to split */
1592 PyObject *sep, /* String separator */
1593 Py_ssize_t maxsplit /* Maxsplit count */
1594 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001595
1596/* Split a string at line breaks, giving a list of Unicode strings.
1597
1598 CRLF is considered to be one line break. Line breaks are not
1599 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600
Mark Hammond91a681d2002-08-12 07:21:58 +00001601PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001602 PyObject *s, /* String to split */
1603 int keepends /* If true, line end markers are included */
1604 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001605
Thomas Wouters477c8d52006-05-27 19:21:47 +00001606/* Partition a string using a given separator. */
1607
1608PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001609 PyObject *s, /* String to partition */
1610 PyObject *sep /* String separator */
1611 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001612
1613/* Partition a string using a given separator, searching from the end of the
1614 string. */
1615
1616PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 PyObject *s, /* String to partition */
1618 PyObject *sep /* String separator */
1619 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001620
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001621/* Split a string giving a list of Unicode strings.
1622
1623 If sep is NULL, splitting will be done at all whitespace
1624 substrings. Otherwise, splits occur at the given separator.
1625
1626 At most maxsplit splits will be done. But unlike PyUnicode_Split
1627 PyUnicode_RSplit splits from the end of the string. If negative,
1628 no limit is set.
1629
1630 Separators are not included in the resulting list.
1631
1632*/
1633
1634PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 PyObject *s, /* String to split */
1636 PyObject *sep, /* String separator */
1637 Py_ssize_t maxsplit /* Maxsplit count */
1638 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001639
Guido van Rossumd8225182000-03-10 22:33:05 +00001640/* Translate a string by applying a character mapping table to it and
1641 return the resulting Unicode object.
1642
1643 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001644 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001645
1646 Mapping tables may be dictionaries or sequences. Unmapped character
1647 ordinals (ones which cause a LookupError) are left untouched and
1648 are copied as-is.
1649
1650*/
1651
Mark Hammond91a681d2002-08-12 07:21:58 +00001652PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 PyObject *str, /* String */
1654 PyObject *table, /* Translate table */
1655 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001656 );
1657
1658/* Join a sequence of strings using the given separator and return
1659 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660
Mark Hammond91a681d2002-08-12 07:21:58 +00001661PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 PyObject *separator, /* Separator string */
1663 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001664 );
1665
1666/* Return 1 if substr matches str[start:end] at the given tail end, 0
1667 otherwise. -1 is returned in case of an error. */
1668
Martin v. Löwis18e16552006-02-15 17:27:45 +00001669PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001670 PyObject *str, /* String */
1671 PyObject *substr, /* Prefix or Suffix string */
1672 Py_ssize_t start, /* Start index */
1673 Py_ssize_t end, /* Stop index */
1674 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001675 );
1676
1677/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001678 given search direction or -1 if not found. -2 is returned in case
1679 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001680
Martin v. Löwis18e16552006-02-15 17:27:45 +00001681PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001682 PyObject *str, /* String */
1683 PyObject *substr, /* Substring to find */
1684 Py_ssize_t start, /* Start index */
1685 Py_ssize_t end, /* Stop index */
1686 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001687 );
1688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689/* Like PyUnicode_Find, but search for a single character only. */
1690PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1691 PyObject *str, /* String */
1692 Py_UCS4 ch, /* Character to find */
1693 Py_ssize_t start, /* Start index */
1694 Py_ssize_t end, /* Stop index */
1695 int direction /* Find direction: +1 forward, -1 backward */
1696 );
1697
Barry Warsaw51ac5802000-03-20 16:36:48 +00001698/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001699
Martin v. Löwis18e16552006-02-15 17:27:45 +00001700PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 PyObject *str, /* String */
1702 PyObject *substr, /* Substring to count */
1703 Py_ssize_t start, /* Start index */
1704 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001705 );
1706
Barry Warsaw51ac5802000-03-20 16:36:48 +00001707/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001708 and return the resulting Unicode object. */
1709
Mark Hammond91a681d2002-08-12 07:21:58 +00001710PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001711 PyObject *str, /* String */
1712 PyObject *substr, /* Substring to find */
1713 PyObject *replstr, /* Substring to replace */
1714 Py_ssize_t maxcount /* Max. number of replacements to apply;
1715 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001716 );
1717
1718/* Compare two strings and return -1, 0, 1 for less than, equal,
1719 greater than resp. -1 is also returned on failure, so callers
1719 should check PyErr_Occurred() to detect errors. */
1720
Mark Hammond91a681d2002-08-12 07:21:58 +00001721PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 PyObject *left, /* Left string */
1723 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001724 );
1725
Martin v. Löwis5b222132007-06-10 09:51:05 +00001726PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1727 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001728 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001729 );
1730
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001731/* Rich compare two strings and return one of the following:
1732
1733 - NULL in case an exception was raised
1734 - Py_True or Py_False for successful comparisons
1735 - Py_NotImplemented in case the type combination is unknown
1736
1737 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1738 case the conversion of the arguments to Unicode fails with a
1739 UnicodeDecodeError.
1740
1741 Possible values for op:
1742
1743 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1744
1745*/
1746
1747PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748 PyObject *left, /* Left string */
1749 PyObject *right, /* Right string */
1750 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001751 );
1752
Thomas Wouters7e474022000-07-16 12:04:32 +00001753/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001754 the resulting Unicode string. */
1755
Mark Hammond91a681d2002-08-12 07:21:58 +00001756PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001757 PyObject *format, /* Format string */
1758 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001759 );
1760
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001761/* Checks whether element is contained in container and return 1/0
1762 accordingly.
1763
1764 element has to coerce to a one-element Unicode string. -1 is
1765 returned in case of an error. */
1766
Mark Hammond91a681d2002-08-12 07:21:58 +00001767PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001768 PyObject *container, /* Container string */
1769 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001770 );
1771
Martin v. Löwis47383402007-08-15 07:32:56 +00001772/* Checks whether argument is a valid identifier. */
1773
1774PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1775
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001776#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001777/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001778PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001779 PyUnicodeObject *self,
1780 int striptype,
1781 PyObject *sepobj
1782 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001783#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001784
Eric Smith5807c412008-05-11 21:00:57 +00001785/* Using the current locale, insert the thousands grouping
1786 into the string pointed to by buffer. For the argument descriptions,
1787 see Objects/stringlib/localeutil.h */
1788
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001789#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001790PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1791 Py_ssize_t n_buffer,
1792 Py_UNICODE *digits,
1793 Py_ssize_t n_digits,
1794 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001795#endif
Eric Smith5807c412008-05-11 21:00:57 +00001796
Eric Smitha3b1ac82009-04-03 14:45:06 +00001797/* Using explicit passed-in values, insert the thousands grouping
1798 into the string pointed to by buffer. For the argument descriptions,
1799 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001800#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1802 int kind,
1803 void *buffer,
1804 Py_ssize_t n_buffer,
1805 void *digits,
1806 Py_ssize_t n_digits,
1807 Py_ssize_t min_width,
1808 const char *grouping,
1809 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001810#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001811/* === Characters Type APIs =============================================== */
1812
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001813/* Helper array used by Py_UNICODE_ISSPACE(). */
1814
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001815#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001816PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1817
Guido van Rossumd8225182000-03-10 22:33:05 +00001818/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001819 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001820
1821 These APIs are implemented in Objects/unicodectype.c.
1822
1823*/
1824
Mark Hammond91a681d2002-08-12 07:21:58 +00001825PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001826 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001827 );
1828
Mark Hammond91a681d2002-08-12 07:21:58 +00001829PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001830 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001831 );
1832
Mark Hammond91a681d2002-08-12 07:21:58 +00001833PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001834 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001835 );
1836
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001837PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001838 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001839 );
1840
1841PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001842 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001843 );
1844
Mark Hammond91a681d2002-08-12 07:21:58 +00001845PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001846 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001847 );
1848
Mark Hammond91a681d2002-08-12 07:21:58 +00001849PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001850 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001851 );
1852
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001853PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1854 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001855 );
1856
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001857PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1858 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001859 );
1860
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001861PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1862 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001863 );
1864
Mark Hammond91a681d2002-08-12 07:21:58 +00001865PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001866 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001867 );
1868
/* Prototype only: accepts one Py_UCS4 code point and returns an int.
   NOTE(review): presumably the digit value of ch; the value reported
   for non-digit characters is not visible here - confirm against the
   definition before relying on a sentinel. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001869PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001870 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001871 );
1872
/* Prototype only: accepts one Py_UCS4 code point and returns a double.
   NOTE(review): presumably the numeric value of ch (a double leaves
   room for fractions); the value reported for non-numeric characters
   is not visible here - confirm against the definition. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001873PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001874 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001875 );
1876
/* Prototype only: accepts one Py_UCS4 code point and returns an int.
   NOTE(review): presumably nonzero when ch is a decimal digit
   character - confirm against the definition. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001877PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001878 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001879 );
1880
/* Prototype only: accepts one Py_UCS4 code point and returns an int.
   NOTE(review): presumably nonzero when ch is a digit character; how
   this differs from the decimal-digit test is decided by the
   definition - confirm there. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001881PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001882 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001883 );
1884
/* Prototype only: accepts one Py_UCS4 code point and returns an int.
   NOTE(review): presumably nonzero when ch carries a numeric value -
   confirm against the definition. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001885PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001886 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001887 );
1888
/* Prototype only: accepts one Py_UCS4 code point and returns an int.
   NOTE(review): presumably nonzero when ch is considered printable;
   the precise definition of "printable" is not visible here - confirm. */
Georg Brandl559e5d72008-06-11 18:37:52 +00001889PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001890 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001891 );
1892
/* Prototype only: accepts one Py_UCS4 code point and returns an int.
   NOTE(review): presumably nonzero when ch is an alphabetic
   character - confirm against the definition. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001893PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001894 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001895 );
1896
/* Prototype only: takes a read-only Py_UNICODE pointer, returns size_t.
   NOTE(review): by name a Py_UNICODE counterpart of C strlen(), which
   would require u to be NUL-terminated and count code units, not
   bytes - confirm against the definition. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001897PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1898 const Py_UNICODE *u
1899 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001900
/* Prototype only: s1 is writable, s2 is read-only, result is Py_UNICODE*.
   No length argument is passed, so the signature gives no bound on s1.
   NOTE(review): by name a Py_UNICODE counterpart of C strcpy()
   (copy s2 into s1, return s1) - confirm against the definition. */
1901PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001902 Py_UNICODE *s1,
1903 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001904
/* Prototype only: s1 is writable, s2 is read-only, result is Py_UNICODE*.
   No length argument is passed, so the signature gives no bound on s1.
   NOTE(review): by name a Py_UNICODE counterpart of C strcat()
   (append s2 to s1, return s1) - confirm against the definition. */
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001905PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1906 Py_UNICODE *s1, const Py_UNICODE *s2);
1907
/* Prototype only: s1 is writable, s2 is read-only, n is a size_t limit,
   result is Py_UNICODE*.
   NOTE(review): by name a Py_UNICODE counterpart of C strncpy(); if it
   mirrors the C function, s1 may be left without a terminator when s2
   holds n or more units - confirm against the definition. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001908PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001909 Py_UNICODE *s1,
1910 const Py_UNICODE *s2,
1911 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001912
/* Prototype only: compares two read-only Py_UNICODE buffers, returns int.
   NOTE(review): by name a Py_UNICODE counterpart of C strcmp()
   (negative / zero / positive ordering result) - confirm the exact
   return values against the definition. */
1913PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001914 const Py_UNICODE *s1,
1915 const Py_UNICODE *s2
1916 );
1917
/* Prototype only: compares two read-only Py_UNICODE buffers with a
   size_t limit n, returns int.
   NOTE(review): by name a Py_UNICODE counterpart of C strncmp()
   (compare at most n units) - confirm against the definition. */
1918PyAPI_FUNC(int) Py_UNICODE_strncmp(
1919 const Py_UNICODE *s1,
1920 const Py_UNICODE *s2,
1921 size_t n
1922 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001923
/* Prototype only: takes a read-only buffer s and a unit c, and returns
   a non-const Py_UNICODE* even though s is const-qualified.
   NOTE(review): by name a Py_UNICODE counterpart of C strchr()
   (first occurrence of c, NULL when absent) - confirm against the
   definition. */
1924PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001925 const Py_UNICODE *s,
1926 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001927 );
1928
/* Prototype only: takes a read-only buffer s and a unit c, and returns
   a non-const Py_UNICODE* even though s is const-qualified.
   NOTE(review): by name a Py_UNICODE counterpart of C strrchr()
   (last occurrence of c, NULL when absent) - confirm against the
   definition. */
Victor Stinner331ea922010-08-10 16:37:20 +00001929PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001930 const Py_UNICODE *s,
1931 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001932 );
1933
/* Prototype only: takes a read-only Py_UCS4 pointer, returns size_t.
   Same parameter shape as Py_UNICODE_strlen, on Py_UCS4 units.
   NOTE(review): by name a Py_UCS4 counterpart of C strlen(), which
   would require u to be NUL-terminated - confirm against the
   definition. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934PyAPI_FUNC(size_t) Py_UCS4_strlen(
1935 const Py_UCS4 *u
1936 );
1937
/* Prototype only: s1 is writable, s2 is read-only, result is Py_UCS4*.
   No length argument is passed, so the signature gives no bound on s1.
   NOTE(review): by name a Py_UCS4 counterpart of C strcpy()
   (copy s2 into s1, return s1) - confirm against the definition. */
1938PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1939 Py_UCS4 *s1,
1940 const Py_UCS4 *s2);
1941
/* Prototype only: s1 is writable, s2 is read-only, result is Py_UCS4*.
   No length argument is passed, so the signature gives no bound on s1.
   NOTE(review): by name a Py_UCS4 counterpart of C strcat()
   (append s2 to s1, return s1) - confirm against the definition. */
1942PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1943 Py_UCS4 *s1, const Py_UCS4 *s2);
1944
/* Prototype only: s1 is writable, s2 is read-only, n is a size_t limit,
   result is Py_UCS4*.
   NOTE(review): by name a Py_UCS4 counterpart of C strncpy(); if it
   mirrors the C function, s1 may be left without a terminator when s2
   holds n or more units - confirm against the definition. */
1945PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1946 Py_UCS4 *s1,
1947 const Py_UCS4 *s2,
1948 size_t n);
1949
/* Prototype only: compares two read-only Py_UCS4 buffers, returns int.
   NOTE(review): by name a Py_UCS4 counterpart of C strcmp()
   (negative / zero / positive ordering result) - confirm the exact
   return values against the definition. */
1950PyAPI_FUNC(int) Py_UCS4_strcmp(
1951 const Py_UCS4 *s1,
1952 const Py_UCS4 *s2
1953 );
1954
/* Prototype only: compares two read-only Py_UCS4 buffers with a size_t
   limit n, returns int.
   NOTE(review): by name a Py_UCS4 counterpart of C strncmp()
   (compare at most n units) - confirm against the definition. */
1955PyAPI_FUNC(int) Py_UCS4_strncmp(
1956 const Py_UCS4 *s1,
1957 const Py_UCS4 *s2,
1958 size_t n
1959 );
1960
/* Prototype only: takes a read-only buffer s and a code point c, and
   returns a non-const Py_UCS4* even though s is const-qualified.
   NOTE(review): by name a Py_UCS4 counterpart of C strchr()
   (first occurrence of c, NULL when absent) - confirm against the
   definition. */
1961PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1962 const Py_UCS4 *s,
1963 Py_UCS4 c
1964 );
1965
/* Prototype only: takes a read-only buffer s and a code point c, and
   returns a non-const Py_UCS4* even though s is const-qualified.
   NOTE(review): by name a Py_UCS4 counterpart of C strrchr()
   (last occurrence of c, NULL when absent) - confirm against the
   definition. */
1966PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1967 const Py_UCS4 *s,
1968 Py_UCS4 c
1969 );
1970
Victor Stinner71133ff2010-09-01 23:43:53 +00001971/* Create a copy of a unicode string, terminated by a NUL character. Return
1972 NULL and raise a MemoryError exception on memory allocation failure;
1973 otherwise return a newly allocated buffer that the caller owns and must release with PyMem_Free(). */
1974
Victor Stinner46408602010-09-03 16:18:00 +00001975PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001976 PyObject *unicode
1977 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001978#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001979
Guido van Rossumd8225182000-03-10 22:33:05 +00001980#ifdef __cplusplus
1981}
1982#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001983#endif /* !Py_UNICODEOBJECT_H */