blob: 0c2b488b5e1415f0a234abc3326b3421f937684e [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE used to be the native Unicode storage format (code unit) of
   Python, representing a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and is now only a typedef for
   wchar_t.  It is not available when Py_LIMITED_API is defined. */

#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef PY_UNICODE_TYPE Py_UNICODE;
#endif
95
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* HAVE_USABLE_WCHAR_T implies HAVE_WCHAR_H. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
#  define HAVE_WCHAR_H
#endif

/* HAVE_MBCS is defined whenever MS_WINDOWS is defined. */
#ifdef MS_WINDOWS
#  define HAVE_MBCS
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h (thanks to Thomas Wouters):
   <time.h> is included before <wchar.h> on that platform. */
#  ifdef _HAVE_BSDI
#    include <time.h>
#  endif
#  include <wchar.h>
#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Non-zero if ch is whitespace.  Values below 128 are answered from the
   _Py_ascii_whitespace look-up table; all other values are passed to
   _PyUnicode_IsWhitespace().  ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
/* Character classification and conversion macros.  Each one forwards its
   argument unchanged to the _PyUnicode_* function of the matching name. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
/* Non-zero if ch is alphabetic, a decimal digit, a digit or numeric.
   The checks are tried in that order and stop at the first that succeeds;
   ch may therefore be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)          \
    (Py_UNICODE_ISALPHA(ch)     ||      \
     Py_UNICODE_ISDECIMAL(ch)   ||      \
     Py_UNICODE_ISDIGIT(ch)     ||      \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy length Py_UNICODE code units from source to target via Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), sizeof(Py_UNICODE) * (length))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Set the first length code units of target to value.
   target, value and length are each evaluated exactly once (length is
   captured in a local before the loop), so arguments with side effects
   are safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.  The argument is parenthesized in the
   expansion so that any expression can be passed; it is evaluated up to
   two times. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check whether substring occurs in string at the given offset.  The
   offset must be valid, and the substring must not be empty.  The first and
   the last code unit are compared before the whole range is handed to
   memcmp(). */

#define Py_UNICODE_MATCH(string, offset, substring)                       \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) &&            \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) ==       \
       *((substring)->wstr + (substring)->wstr_length-1))) &&             \
     memcmp((string)->wstr + (offset), (substring)->wstr,                 \
            (substring)->wstr_length*sizeof(Py_UNICODE)) == 0)
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200209 /* There a 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
220 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
241 * kind = PyUnicode_WCHAR_KIND
242 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200243 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * ready = 0
245 * wstr is not NULL
246 * data.any is NULL
247 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200248 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200250
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200251 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200252
253 * structure = PyUnicodeObject structure
254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255 PyUnicode_4BYTE_KIND
256 * compact = 0
257 * ready = 1
258 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200259 * utf8 is shared and utf8_length = length with data.any if ascii = 1
260 * utf8_length = 0 if utf8 is NULL
261 * wstr is shared and wstr_length = length with data.any
262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
263 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
264 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200265
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200266 Compact strings use only one memory block (structure + characters),
267 whereas legacy strings use one block for the structure and one block
268 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200269
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200270 Legacy strings are created by PyUnicode_FromUnicode() and
271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272 when PyUnicode_READY() is called.
273
274 See also _PyUnicode_CheckConsistency().
275 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000276 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000278 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 struct {
280 /*
281 SSTATE_NOT_INTERNED (0)
282 SSTATE_INTERNED_MORTAL (1)
283 SSTATE_INTERNED_IMMORTAL (2)
284
285 If interned != SSTATE_NOT_INTERNED, the two references from the
286 dictionary to this object are *not* counted in ob_refcnt.
287 */
288 unsigned int interned:2;
289 /* Character size:
290
291 PyUnicode_WCHAR_KIND (0): wchar_t*
292 PyUnicode_1BYTE_KIND (1): Py_UCS1*
293 PyUnicode_2BYTE_KIND (2): Py_UCS2*
294 PyUnicode_4BYTE_KIND (3): Py_UCS4*
295 */
296 unsigned int kind:2;
297 /* Compact is with respect to the allocation scheme. Compact unicode
298 objects only require one memory block while non-compact objects use
299 one block for the PyUnicodeObject struct and another for its data
300 buffer. */
301 unsigned int compact:1;
Victor Stinnera3b334d2011-10-03 13:53:37 +0200302 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
303 characters. If ascii is 1 and compact is 1, use the PyASCIIObject
304 structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200305 unsigned int ascii:1;
306 /* The ready flag indicates whether the object layout is initialized
307 completely. This means that this is either a compact object, or
308 the data pointer is filled out. The bit is redundant, and helps
309 to minimize the test in PyUnicode_IS_READY(). */
310 unsigned int ready:1;
311 } state;
312 wchar_t *wstr; /* wchar_t representation (null-terminated) */
313} PyASCIIObject;
314
315/* Non-ASCII strings allocated through PyUnicode_New use the
316 PyCompactUnicodeOject structure. state.compact is set, and the data
317 immediately follow the structure. */
318typedef struct {
319 PyASCIIObject _base;
320 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
321 * terminating \0. */
322 char *utf8; /* UTF-8 representation (null-terminated) */
323 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
324 * surrogates count as two code points. */
325} PyCompactUnicodeObject;
326
327/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
328 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200329 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330typedef struct {
331 PyCompactUnicodeObject _base;
332 union {
333 void *any;
334 Py_UCS1 *latin1;
335 Py_UCS2 *ucs2;
336 Py_UCS4 *ucs4;
337 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000338} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000339#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000340
Mark Hammond91a681d2002-08-12 07:21:58 +0000341PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000342PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000343
Thomas Wouters27d517b2007-02-25 20:39:11 +0000344#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000345 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
346#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000347
348/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000349#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350
/* Return the length of the wstr representation: the length field for
   compact ASCII strings (PyASCIIObject), otherwise the wstr_length field of
   PyCompactUnicodeObject.  op is parenthesized before it is cast, so any
   pointer expression may be passed; it is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
355
/* Return the size of the deprecated Py_UNICODE representation in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available yet (wstr is NULL),
   PyUnicode_AsUnicode() is called first to compute it.
   Use PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op)                                  \
    (assert(PyUnicode_Check(op)),                               \
     (((PyASCIIObject *)(op))->wstr ?                           \
         (void)0 :                                              \
         (void)PyUnicode_AsUnicode((PyObject *)(op))),          \
     PyUnicode_WSTR_LENGTH(op))

/* PyUnicode_GET_SIZE() multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (Py_UNICODE_SIZE * PyUnicode_GET_SIZE(op))
370
/* Alias for PyUnicode_AsUnicode().  The cached wstr pointer is returned when
   it is set; otherwise the wchar_t/Py_UNICODE representation is created on
   demand.  Using this macro is very inefficient now, try to port your code
   to use the new PyUnicode_*BYTE_DATA() macros or use PyUnicode_WRITE() and
   PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op)                                \
    (assert(PyUnicode_Check(op)),                               \
     !(((PyASCIIObject *)(op))->wstr) ?                         \
         PyUnicode_AsUnicode((PyObject *)(op)) :                \
         (((PyASCIIObject *)(op))->wstr))

/* The result of PyUnicode_AS_UNICODE() cast to const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
383
384
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */

/* Values for PyUnicodeObject.state: */

/* Interning state (values of state.interned). */
#define SSTATE_NOT_INTERNED      0
#define SSTATE_INTERNED_MORTAL   1
#define SSTATE_INTERNED_IMMORTAL 2
393
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  op is parenthesized before it is cast, so
   any pointer expression may be passed. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)
399
/* Return true if the string is compact (state.compact is set), or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op)                \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (one that uses the
   PyASCIIObject structure), or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op)          \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200409
/* The string only has a wstr (wchar_t) representation.  This is only
   possible when the string was created with a legacy API and
   _PyUnicode_Ready() has not been called yet. */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro: */

#define PyUnicode_1BYTE_KIND 1      /* data is Py_UCS1* */
#define PyUnicode_2BYTE_KIND 2      /* data is Py_UCS2* */
#define PyUnicode_4BYTE_KIND 3      /* data is Py_UCS4* */
420
421
/* Return the number of bytes the string uses to represent a single
   character; this can be 1, 2 or 4.  It is computed as
   1 << (PyUnicode_KIND(op) - 1).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op)    \
    (1 << (PyUnicode_KIND(op) - 1))
428
/* Return the pointer to the canonical representation cast to Py_UCS1*,
   Py_UCS2* or Py_UCS4* for direct character access.
   No checks are performed; use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() beforehand to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op)    ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op)    ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op)    ((Py_UCS4*)PyUnicode_DATA(op))
437
/* Return one of the PyUnicode_*_KIND values defined above (the state.kind
   field).  Asserts that op is a Unicode object and that it is ready. */
#define PyUnicode_KIND(op)                      \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
443
/* Return a void pointer to the raw unicode buffer. */

/* Compact strings: the buffer starts right after the structure, which is
   PyASCIIObject for compact ASCII strings and PyCompactUnicodeObject
   otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                           \
     ((void*)&((PyASCIIObject*)(op))[1]) :                      \
     ((void*)&((PyCompactUnicodeObject*)(op))[1]))

/* Non-compact strings: the buffer is data.any, which is asserted to be
   non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                          \
    (assert(((PyUnicodeObject*)(op))->data.any),                \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Select one of the two accessors above according to state.compact. */
#define PyUnicode_DATA(op)                                      \
    (assert(PyUnicode_Check(op)),                               \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
                                _PyUnicode_NONCOMPACT_DATA(op))
458
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
   The index is a character index, the result is a size in bytes.
   The multiplication is done as a left shift by (kind - 1).

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) \
    ((index) << ((kind) - 1))
464
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation.  This macro does not do any
   sanity checks and is intended for usage in loops.  The caller should
   cache the kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.  The value is
   cast to the character type selected by kind. */
#define PyUnicode_WRITE(kind, data, index, value)                       \
    do {                                                                \
        switch ((kind)) {                                               \
        case PyUnicode_1BYTE_KIND:                                      \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value);            \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value);            \
            break;                                                      \
        default:                                                        \
            assert((kind) == PyUnicode_4BYTE_KIND);                     \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value);            \
            break;                                                      \
        }                                                               \
    } while (0)
491
/* Read a code point from the string's canonical representation and return
   it as Py_UCS4.  data is indexed as Py_UCS1 or Py_UCS2 for the matching
   kind and as Py_UCS4 for any other kind.  No checks or ready calls are
   performed; kind may be evaluated more than once. */
#define PyUnicode_READ(kind, data, index)                       \
    ((Py_UCS4)                                                  \
     ((kind) == PyUnicode_1BYTE_KIND ?                          \
          ((const Py_UCS1 *)(data))[(index)] :                  \
      (kind) == PyUnicode_2BYTE_KIND ?                          \
          ((const Py_UCS2 *)(data))[(index)] :                  \
          ((const Py_UCS4 *)(data))[(index)]))
503
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR; for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.  Asserts that unicode is a
   ready Unicode object. */
#define PyUnicode_READ_CHAR(unicode, index)                                 \
    (assert(PyUnicode_Check(unicode)),                                      \
     assert(PyUnicode_IS_READY(unicode)),                                   \
     (Py_UCS4)                                                              \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ?                \
             ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] :      \
         PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ?                \
             ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] :      \
             ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200519
/* Return the length of the unicode string (the length field).  The caller
   has to make sure that the string has its canonical representation set
   before calling this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
527
528
529/* Fast check to determine whether an object is ready. Equivalent to
530 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */
531
532#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
533
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: when the object is already ready it evaluates to 0 without a
   function call.  If the canonical representation is not yet set, it will
   still call _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready((PyObject *)(op)) : 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string:

     compact ASCII                          -> 0x7f
     1-byte kind, data pointer == utf8      -> 0x7f
     1-byte kind otherwise                  -> 0xff
     2-byte kind                            -> 0xffff
     any other kind                         -> 0x10ffff

   Asserts that op is ready. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                        \
    (assert(PyUnicode_IS_READY(op)),                                        \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f :                               \
      PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                          \
          (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
               (0x7fU) : (0xffU)) :                                         \
      PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                          \
          (0xffffU) : (0x10ffffU)))
556
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000557#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000558
559/* --- Constants ---------------------------------------------------------- */
560
/* Replacement character used during decoding when the errors argument is
   set to "replace".  Note: the Unicode character U+FFFD is the official
   REPLACEMENT CHARACTER in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER \
    ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000567
568/* === Public API ========================================================= */
569
570/* --- Plain Py_UNICODE --------------------------------------------------- */
571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200572/* With PEP 393, this is the recommended way to allocate a new unicode object.
573 This function will allocate the object and its buffer in a single memory
574 block. Objects created using this function are not resizable. */
575#ifndef Py_LIMITED_API
576PyAPI_FUNC(PyObject*) PyUnicode_New(
577 Py_ssize_t size, /* Number of code points in the new string */
578 Py_UCS4 maxchar /* maximum code point value in the string */
579 );
580#endif
581
Victor Stinnerd8f65102011-09-29 19:43:17 +0200582/* Initializes the canonical string representation from a the deprecated
583 wstr/Py_UNICODE representation. This function is used to convert Unicode
584 objects which were created using the old API to the new flexible format
585 introduced with PEP 393.
586
587 Don't call this function directly, use the public PyUnicode_READY() macro
588 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200589#ifndef Py_LIMITED_API
590PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200591 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592 );
593#endif
594
Victor Stinner034f6cf2011-09-30 02:26:44 +0200595/* Get a copy of a Unicode string. */
596PyAPI_FUNC(PyObject*) PyUnicode_Copy(
597 PyObject *unicode
598 );
599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200600/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200601 character conversion when necessary and falls back to memcpy if possible.
602
Victor Stinnera0702ab2011-09-29 14:14:38 +0200603 Fail if to is too small (smaller than how_many or smaller than
604 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
605 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200606
607 Return the number of written character, or return -1 and raise an exception
608 on error.
609
610 Pseudo-code:
611
612 how_many = min(how_many, len(from) - from_start)
613 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
614 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200615
616 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200617 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200618#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200619PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620 PyObject *to,
621 Py_ssize_t to_start,
622 PyObject *from,
623 Py_ssize_t from_start,
624 Py_ssize_t how_many
625 );
626#endif
627
Guido van Rossumd8225182000-03-10 22:33:05 +0000628/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000629 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000630
631 u may be NULL which causes the contents to be undefined. It is the
632 user's responsibility to fill in the needed data afterwards. Note
633 that modifying the Unicode object contents after construction is
634 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000635
636 The buffer is copied into the new object. */
637
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000638#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000639PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000640 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000641 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000642 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000643#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000644
Georg Brandl952867a2010-06-27 10:17:12 +0000645/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000647 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000648 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000649 );
650
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000651/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200652 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000653PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000654 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000655 );
656
Victor Stinnerb9275c12011-10-05 14:01:42 +0200657/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
658 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659#ifndef Py_LIMITED_API
660PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
661 int kind,
662 const void *buffer,
663 Py_ssize_t size);
664#endif
665
666PyAPI_FUNC(PyObject*) PyUnicode_Substring(
667 PyObject *str,
668 Py_ssize_t start,
669 Py_ssize_t end);
670
671/* Copy the string into a UCS4 buffer including the null character if copy_null
672 is set. Return NULL and raise an exception on error. Raise a ValueError if
673 the buffer is smaller than the string. Return buffer on success.
674
675 buflen is the length of the buffer in (Py_UCS4) characters. */
676PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
677 PyObject *unicode,
678 Py_UCS4* buffer,
679 Py_ssize_t buflen,
680 int copy_null);
681
682/* Copy the string into a UCS4 buffer. A new buffer is allocated using
683 * PyMem_Malloc; if this fails, NULL is returned with a memory error
684 exception set. */
685PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
686
Guido van Rossumd8225182000-03-10 22:33:05 +0000687/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200688 Py_UNICODE buffer.
689 If the wchar_t/Py_UNICODE representation is not yet available, this
690 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000691
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000692#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000693PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000695 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000696#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200698/* Return a read-only pointer to the Unicode object's internal
699 Py_UNICODE buffer and save the length at size.
700 If the wchar_t/Py_UNICODE representation is not yet available, this
701 function will calculate it. */
702
703#ifndef Py_LIMITED_API
704PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
705 PyObject *unicode, /* Unicode object */
706 Py_ssize_t *size /* location where to save the length */
707 );
708#endif
709
Guido van Rossumd8225182000-03-10 22:33:05 +0000710/* Get the length of the Unicode object. */
711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
713 PyObject *unicode
714);
715
Victor Stinner157f83f2011-09-28 21:41:31 +0200716/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 string representation. */
718
Martin v. Löwis18e16552006-02-15 17:27:45 +0000719PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000721 );
722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723/* Read a character from the string. */
724
725PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
726 PyObject *unicode,
727 Py_ssize_t index
728 );
729
730/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200731 PyUnicode_New, must not be shared, and must not have been hashed yet.
732
733 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734
735PyAPI_FUNC(int) PyUnicode_WriteChar(
736 PyObject *unicode,
737 Py_ssize_t index,
738 Py_UCS4 character
739 );
740
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000741#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000742/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000743PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000744#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000745
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200746/* Resize a Unicode object allocated by the legacy API (e.g.
747 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
748 PyUnicode_New) cannot be resized by this function.
749
750 The length is a number of Py_UNICODE characters (and not the number of code
751 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000752
753 *unicode is modified to point to the new (resized) object and 0
754 returned on success.
755
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200756 If the refcount on the object is 1, the function resizes the string in
757 place, which is usually faster than allocating a new string (and copying
758 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000759
760 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200761 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000762
Mark Hammond91a681d2002-08-12 07:21:58 +0000763PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 PyObject **unicode, /* Pointer to the Unicode object */
765 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000766 );
767
Guido van Rossumd8225182000-03-10 22:33:05 +0000768/* Coerce obj to a Unicode object and return a reference with
769 *incremented* refcount.
770
771 Coercion is done in the following way:
772
Georg Brandl952867a2010-06-27 10:17:12 +0000773 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000774 under the assumptions that they contain data using the UTF-8
775 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000776
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000777 2. All other objects (including Unicode objects) raise an
778 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000779
780 The API returns NULL in case of an error. The caller is responsible
781 for decref'ing the returned objects.
782
783*/
784
Mark Hammond91a681d2002-08-12 07:21:58 +0000785PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000787 const char *encoding, /* encoding */
788 const char *errors /* error handling */
789 );
790
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000791/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000792 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000794 Unicode objects are passed back as-is (subclasses are converted to
795 true Unicode objects), all other objects are delegated to
796 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000797 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000798
799 The API returns NULL in case of an error. The caller is responsible
800 for decref'ing the returned objects.
801
802*/
803
Mark Hammond91a681d2002-08-12 07:21:58 +0000804PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000806 );
807
Victor Stinner1205f272010-09-11 00:54:47 +0000808PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
809 const char *format, /* ASCII-encoded string */
810 va_list vargs
811 );
812PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
813 const char *format, /* ASCII-encoded string */
814 ...
815 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000817#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000818/* Format the object based on the format_spec, as defined in PEP 3101
819 (Advanced String Formatting). */
820PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 PyObject *format_spec,
822 Py_ssize_t start,
823 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000824#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000825
Walter Dörwald16807132007-05-25 13:52:07 +0000826PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
827PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000828PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
829 const char *u /* UTF-8 encoded string */
830 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000831#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000832PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000833#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000834
835/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200836#define PyUnicode_CHECK_INTERNED(op) \
837 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000838
Guido van Rossumd8225182000-03-10 22:33:05 +0000839/* --- wchar_t support for platforms which support it --------------------- */
840
841#ifdef HAVE_WCHAR_H
842
Georg Brandl952867a2010-06-27 10:17:12 +0000843/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000844 size.
845
846 The buffer is copied into the new object. */
847
Mark Hammond91a681d2002-08-12 07:21:58 +0000848PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000849 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000850 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000851 );
852
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000853/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000854 most size wchar_t characters are copied.
855
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000856 Note that the resulting wchar_t string may or may not be
857 0-terminated. It is the responsibility of the caller to make sure
858 that the wchar_t string is 0-terminated in case this is required by
859 the application.
860
861 Returns the number of wchar_t characters copied (excluding a
862 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000863 error. */
864
Martin v. Löwis18e16552006-02-15 17:27:45 +0000865PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000866 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000868 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000869 );
870
Victor Stinner137c34c2010-09-29 10:25:54 +0000871/* Convert the Unicode object to a wide character string. The output string
872 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200873 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000874
875 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
876 on success. On error, returns NULL, *size is undefined and raises a
877 MemoryError. */
878
879PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000880 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000881 Py_ssize_t *size /* number of characters of the result */
882 );
883
Victor Stinner9f789e72011-10-01 03:57:28 +0200884#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200886#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887
Guido van Rossumd8225182000-03-10 22:33:05 +0000888#endif
889
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000890/* --- Unicode ordinals --------------------------------------------------- */
891
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000892/* Create a Unicode Object from the given Unicode code point ordinal.
893
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000894 The ordinal must be in range(0x10000) on narrow Python builds
895 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
896 raised in case it is not.
897
898*/
899
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000900PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000901
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000902/* --- Free-list management ----------------------------------------------- */
903
904/* Clear the free list used by the Unicode implementation.
905
906 This can be used to release memory used for objects on the free
907 list back to the Python memory allocator.
908
909*/
910
911PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
912
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000913/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000914
915 Many of these APIs take two arguments encoding and errors. These
916 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000917 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000918
Georg Brandl952867a2010-06-27 10:17:12 +0000919 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000920
921 Error handling is set by errors which may also be set to NULL
922 meaning to use the default handling defined for the codec. Default
923 error handling for all builtin codecs is "strict" (ValueErrors are
924 raised).
925
926 The codecs all use a similar interface. Only deviations from the
927 generic ones are documented.
928
929*/
930
Fred Drakecb093fe2000-05-09 19:51:53 +0000931/* --- Manage the default encoding ---------------------------------------- */
932
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000933/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000934 Unicode object unicode and the size of the encoded representation
935 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000936
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000937 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000938
Victor Stinner157f83f2011-09-28 21:41:31 +0200939 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940 and subsequent calls will return the same string. The memory is released
941 when the unicodeobject is deallocated.
942
943 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
944 support the previous internal function with the same behaviour.
945
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000946 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000947 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000948
949 *** If you need to access the Unicode object as UTF-8 bytes string,
950 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000951*/
952
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000953#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000955 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000956 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000958#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000959
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000960/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000961 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
964 in the unicodeobject.
965
966 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
967 support the previous internal function with the same behaviour.
968
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000969 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000970 extracted from the returned data.
971
972 *** This API is for interpreter INTERNAL USE ONLY and will likely
973 *** be removed or changed for Python 3.1.
974
975 *** If you need to access the Unicode object as UTF-8 bytes string,
976 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000977
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000978*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000979
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000980#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
982#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000983#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000984
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000985/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000986
Mark Hammond91a681d2002-08-12 07:21:58 +0000987PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000988
Guido van Rossumd8225182000-03-10 22:33:05 +0000989/* --- Generic Codecs ----------------------------------------------------- */
990
991/* Create a Unicode object by decoding the encoded string s of the
992 given size. */
993
Mark Hammond91a681d2002-08-12 07:21:58 +0000994PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000995 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000996 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000997 const char *encoding, /* encoding */
998 const char *errors /* error handling */
999 );
1000
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001001/* Decode a Unicode object unicode and return the result as Python
1002 object. */
1003
1004PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001005 PyObject *unicode, /* Unicode object */
1006 const char *encoding, /* encoding */
1007 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001008 );
1009
1010/* Decode a Unicode object unicode and return the result as Unicode
1011 object. */
1012
1013PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 PyObject *unicode, /* Unicode object */
1015 const char *encoding, /* encoding */
1016 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001017 );
1018
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001019/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001020 Python string object. */
1021
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001022#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001023PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001025 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001026 const char *encoding, /* encoding */
1027 const char *errors /* error handling */
1028 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001029#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001030
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001031/* Encodes a Unicode object and returns the result as Python
1032 object. */
1033
1034PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001035 PyObject *unicode, /* Unicode object */
1036 const char *encoding, /* encoding */
1037 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001038 );
1039
Guido van Rossumd8225182000-03-10 22:33:05 +00001040/* Encodes a Unicode object and returns the result as Python string
1041 object. */
1042
Mark Hammond91a681d2002-08-12 07:21:58 +00001043PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001044 PyObject *unicode, /* Unicode object */
1045 const char *encoding, /* encoding */
1046 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001047 );
1048
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001049/* Encodes a Unicode object and returns the result as Unicode
1050 object. */
1051
1052PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001053 PyObject *unicode, /* Unicode object */
1054 const char *encoding, /* encoding */
1055 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001056 );
1057
1058/* Build an encoding map. */
1059
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001060PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1061 PyObject* string /* 256 character map */
1062 );
1063
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064/* --- UTF-7 Codecs ------------------------------------------------------- */
1065
Mark Hammond91a681d2002-08-12 07:21:58 +00001066PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001067 const char *string, /* UTF-7 encoded string */
1068 Py_ssize_t length, /* size of string */
1069 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 );
1071
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001072PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001073 const char *string, /* UTF-7 encoded string */
1074 Py_ssize_t length, /* size of string */
1075 const char *errors, /* error handling */
1076 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001077 );
1078
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001079#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001080PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001081 const Py_UNICODE *data, /* Unicode char buffer */
1082 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1083 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1084 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1085 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001087#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001088
Guido van Rossumd8225182000-03-10 22:33:05 +00001089/* --- UTF-8 Codecs ------------------------------------------------------- */
1090
Mark Hammond91a681d2002-08-12 07:21:58 +00001091PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 const char *string, /* UTF-8 encoded string */
1093 Py_ssize_t length, /* size of string */
1094 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001095 );
1096
Walter Dörwald69652032004-09-07 20:24:22 +00001097PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 const char *string, /* UTF-8 encoded string */
1099 Py_ssize_t length, /* size of string */
1100 const char *errors, /* error handling */
1101 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001102 );
1103
Mark Hammond91a681d2002-08-12 07:21:58 +00001104PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001106 );
1107
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001108#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1110 PyObject *unicode,
1111 const char *errors);
1112
Mark Hammond91a681d2002-08-12 07:21:58 +00001113PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001114 const Py_UNICODE *data, /* Unicode char buffer */
1115 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1116 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001117 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001118#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001119
Walter Dörwald41980ca2007-08-16 21:55:45 +00001120/* --- UTF-32 Codecs ------------------------------------------------------ */
1121
1122/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1123 the corresponding Unicode object.
1124
1125 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001127
1128 If byteorder is non-NULL, the decoder starts decoding using the
1129 given byte order:
1130
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 *byteorder == -1: little endian
1132 *byteorder == 0: native order
1133 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001134
1135 In native mode, the first four bytes of the stream are checked for a
1136 BOM mark. If found, the BOM mark is analysed, the byte order
1137 adjusted and the BOM skipped. In the other modes, no BOM mark
1138 interpretation is done. After completion, *byteorder is set to the
1139 current byte order at the end of input data.
1140
1141 If byteorder is NULL, the codec starts in native order mode.
1142
1143*/
1144
1145PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001146 const char *string, /* UTF-32 encoded string */
1147 Py_ssize_t length, /* size of string */
1148 const char *errors, /* error handling */
1149 int *byteorder /* pointer to byteorder to use
1150 0=native;-1=LE,1=BE; updated on
1151 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001152 );
1153
1154PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 const char *string, /* UTF-32 encoded string */
1156 Py_ssize_t length, /* size of string */
1157 const char *errors, /* error handling */
1158 int *byteorder, /* pointer to byteorder to use
1159 0=native;-1=LE,1=BE; updated on
1160 exit */
1161 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001162 );
1163
1164/* Returns a Python string using the UTF-32 encoding in native byte
1165 order. The string always starts with a BOM mark. */
1166
1167PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001169 );
1170
1171/* Returns a Python string object holding the UTF-32 encoded value of
1172 the Unicode data.
1173
1174 If byteorder is not 0, output is written according to the following
1175 byte order:
1176
1177 byteorder == -1: little endian
1178 byteorder == 0: native byte order (writes a BOM mark)
1179 byteorder == 1: big endian
1180
1181 If byteorder is 0, the output string will always start with the
1182 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1183 prepended.
1184
1185*/
1186
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001187#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001188PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 const Py_UNICODE *data, /* Unicode char buffer */
1190 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1191 const char *errors, /* error handling */
1192 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001193 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001194#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001195
Guido van Rossumd8225182000-03-10 22:33:05 +00001196/* --- UTF-16 Codecs ------------------------------------------------------ */
1197
Guido van Rossum9e896b32000-04-05 20:11:21 +00001198/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001199 the corresponding Unicode object.
1200
1201 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001203
1204 If byteorder is non-NULL, the decoder starts decoding using the
1205 given byte order:
1206
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 *byteorder == -1: little endian
1208 *byteorder == 0: native order
1209 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001210
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001211 In native mode, the first two bytes of the stream are checked for a
1212 BOM mark. If found, the BOM mark is analysed, the byte order
1213 adjusted and the BOM skipped. In the other modes, no BOM mark
1214 interpretation is done. After completion, *byteorder is set to the
1215 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001216
1217 If byteorder is NULL, the codec starts in native order mode.
1218
1219*/
1220
Mark Hammond91a681d2002-08-12 07:21:58 +00001221PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 const char *string, /* UTF-16 encoded string */
1223 Py_ssize_t length, /* size of string */
1224 const char *errors, /* error handling */
1225 int *byteorder /* pointer to byteorder to use
1226 0=native;-1=LE,1=BE; updated on
1227 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001228 );
1229
Walter Dörwald69652032004-09-07 20:24:22 +00001230PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 const char *string, /* UTF-16 encoded string */
1232 Py_ssize_t length, /* size of string */
1233 const char *errors, /* error handling */
1234 int *byteorder, /* pointer to byteorder to use
1235 0=native;-1=LE,1=BE; updated on
1236 exit */
1237 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001238 );
1239
Guido van Rossumd8225182000-03-10 22:33:05 +00001240/* Returns a Python string using the UTF-16 encoding in native byte
1241 order. The string always starts with a BOM mark. */
1242
Mark Hammond91a681d2002-08-12 07:21:58 +00001243PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001245 );
1246
1247/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001248 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001249
1250 If byteorder is not 0, output is written according to the following
1251 byte order:
1252
1253 byteorder == -1: little endian
1254 byteorder == 0: native byte order (writes a BOM mark)
1255 byteorder == 1: big endian
1256
1257 If byteorder is 0, the output string will always start with the
1258 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1259 prepended.
1260
1261 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1262 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001263 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001264
1265*/
1266
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001267#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001268PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 const Py_UNICODE *data, /* Unicode char buffer */
1270 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1271 const char *errors, /* error handling */
1272 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001273 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001274#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001275
1276/* --- Unicode-Escape Codecs ---------------------------------------------- */
1277
Mark Hammond91a681d2002-08-12 07:21:58 +00001278PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 const char *string, /* Unicode-Escape encoded string */
1280 Py_ssize_t length, /* size of string */
1281 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001282 );
1283
Mark Hammond91a681d2002-08-12 07:21:58 +00001284PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001286 );
1287
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001288#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001289PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 const Py_UNICODE *data, /* Unicode char buffer */
1291 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001292 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001293#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001294
1295/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1296
Mark Hammond91a681d2002-08-12 07:21:58 +00001297PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 const char *string, /* Raw-Unicode-Escape encoded string */
1299 Py_ssize_t length, /* size of string */
1300 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001301 );
1302
Mark Hammond91a681d2002-08-12 07:21:58 +00001303PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001305 );
1306
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001307#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001308PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 const Py_UNICODE *data, /* Unicode char buffer */
1310 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001311 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001312#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001313
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001314/* --- Unicode Internal Codec ---------------------------------------------
1315
1316 Only for internal use in _codecsmodule.c */
1317
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001318#ifndef Py_LIMITED_API
/* Decoder for the "Unicode Internal" codec; excluded from the limited API.
   NOTE(review): unlike the other declarations in this header, this one is
   not wrapped in PyAPI_FUNC() -- confirm that this is intentional. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001319PyObject *_PyUnicode_DecodeUnicodeInternal(
1320 const char *string, /* encoded string (presumably the internal representation; TODO confirm) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001321 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001322 const char *errors /* error handling */
1323 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001324#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001325
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001327
1328 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1329
1330*/
1331
Mark Hammond91a681d2002-08-12 07:21:58 +00001332PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 const char *string, /* Latin-1 encoded string */
1334 Py_ssize_t length, /* size of string */
1335 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001336 );
1337
Mark Hammond91a681d2002-08-12 07:21:58 +00001338PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001339 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001340 );
1341
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001342#ifndef Py_LIMITED_API
/* Variant of PyUnicode_AsLatin1String() that takes an explicit errors
   argument; not part of the limited API.
   NOTE(review): presumably errors selects the error handler used while
   encoding -- confirm against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1344 PyObject* unicode, /* Unicode object */
1345 const char* errors); /* error handling */
1346
Mark Hammond91a681d2002-08-12 07:21:58 +00001347PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 const Py_UNICODE *data, /* Unicode char buffer */
1349 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1350 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001351 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001352#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001353
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001354/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001355
1356 Only 7-bit ASCII data is accepted. All other codes generate errors.
1357
1358*/
1359
Mark Hammond91a681d2002-08-12 07:21:58 +00001360PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001361 const char *string, /* ASCII encoded string */
1362 Py_ssize_t length, /* size of string */
1363 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001364 );
1365
Mark Hammond91a681d2002-08-12 07:21:58 +00001366PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001368 );
1369
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001370#ifndef Py_LIMITED_API
/* Variant of PyUnicode_AsASCIIString() that takes an explicit errors
   argument; not part of the limited API.
   NOTE(review): presumably errors selects the error handler used while
   encoding -- confirm against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1372 PyObject* unicode, /* Unicode object */
1373 const char* errors); /* error handling */
1374
Mark Hammond91a681d2002-08-12 07:21:58 +00001375PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001376 const Py_UNICODE *data, /* Unicode char buffer */
1377 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1378 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001379 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001380#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001383
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001385
1386 Decoding mappings must map single string characters to single
1387 Unicode characters, integers (which are then interpreted as Unicode
1388 ordinals) or None (meaning "undefined mapping" and causing an
1389 error).
1390
1391 Encoding mappings must map single Unicode characters to single
1392 string characters, integers (which are then interpreted as Latin-1
1393 ordinals) or None (meaning "undefined mapping" and causing an
1394 error).
1395
1396 If a character lookup fails with a LookupError, the character is
1397 copied as-is meaning that its ordinal value will be interpreted as
1398 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1399 to contain those mappings which map characters to different code
1400 points.
1401
1402*/
1403
Mark Hammond91a681d2002-08-12 07:21:58 +00001404PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001405 const char *string, /* Encoded string */
1406 Py_ssize_t length, /* size of string */
1407 PyObject *mapping, /* character mapping
1408 (char ordinal -> unicode ordinal) */
1409 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001410 );
1411
Mark Hammond91a681d2002-08-12 07:21:58 +00001412PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 PyObject *unicode, /* Unicode object */
1414 PyObject *mapping /* character mapping
1415 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001416 );
1417
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001418#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001419PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001420 const Py_UNICODE *data, /* Unicode char buffer */
1421 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1422 PyObject *mapping, /* character mapping
1423 (unicode ordinal -> char ordinal) */
1424 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001425 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001426#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001427
1428/* Translate a Py_UNICODE buffer of the given length by applying a
1429 character mapping table to it and return the resulting Unicode
1430 object.
1431
1432 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001434
1435 Mapping tables may be dictionaries or sequences. Unmapped character
1436 ordinals (ones which cause a LookupError) are left untouched and
1437 are copied as-is.
1438
1439*/
1440
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001441#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001442PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 const Py_UNICODE *data, /* Unicode char buffer */
1444 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1445 PyObject *table, /* Translate table */
1446 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001447 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001448#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001449
Victor Stinner99b95382011-07-04 14:23:54 +02001450#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001451
Guido van Rossumefec1152000-03-28 02:01:15 +00001452/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001453
Mark Hammond91a681d2002-08-12 07:21:58 +00001454PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001455 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001456 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001457 const char *errors /* error handling */
1458 );
1459
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001460PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1461 const char *string, /* MBCS encoded string */
1462 Py_ssize_t length, /* size of string */
1463 const char *errors, /* error handling */
1464 Py_ssize_t *consumed /* bytes consumed */
1465 );
1466
Mark Hammond91a681d2002-08-12 07:21:58 +00001467PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001468 PyObject *unicode /* Unicode object */
1469 );
1470
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001471#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001472PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001473 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001474 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001475 const char *errors /* error handling */
1476 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001477#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001478
Victor Stinner99b95382011-07-04 14:23:54 +02001479#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001480
Guido van Rossum9e896b32000-04-05 20:11:21 +00001481/* --- Decimal Encoder ---------------------------------------------------- */
1482
1483/* Takes a Unicode string holding a decimal value and writes it into
1484 an output buffer using standard ASCII digit codes.
1485
1486 The output buffer has to provide at least length+1 bytes of storage
1487 area. The output string is 0-terminated.
1488
1489 The encoder converts whitespace to ' ', decimal characters to their
1490 corresponding ASCII digit and all other Latin-1 characters except
1491 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1492 are treated as errors. This includes embedded NUL characters.
1493
1494 Error handling is defined by the errors argument:
1495
1496 NULL or "strict": raise a ValueError
1497 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001498 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001499 "replace": replaces illegal characters with '?'
1500
1501 Returns 0 on success, -1 on failure.
1502
1503*/
1504
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001505#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001506PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 Py_UNICODE *s, /* Unicode buffer */
1508 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1509 char *output, /* Output buffer; must have size >= length+1 (room for the 0 terminator) */
1510 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001511 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001512#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001513
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001514/* Transforms code points that have decimal digit property to the
1515 corresponding ASCII digit code points.
1516
1517 Returns a new Unicode string on success, NULL on failure.
1518*/
1519
Georg Brandlb5503082010-12-05 11:40:48 +00001520#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001521PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1522 Py_UNICODE *s, /* Unicode buffer */
1523 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1524 );
Georg Brandlb5503082010-12-05 11:40:48 +00001525#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1528 as argument instead of a raw buffer and length. This function additionally
1529 transforms spaces to ASCII because this is what the callers in longobject,
1530 floatobject, and complexobject did anyways. */
1531
1532#ifndef Py_LIMITED_API
1533PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1534 PyObject *unicode /* Unicode object */
1535 );
1536#endif
1537
Martin v. Löwis011e8422009-05-05 04:43:17 +00001538/* --- File system encoding ---------------------------------------------- */
1539
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001540/* ParseTuple converter: encode str objects to bytes using
1541 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001542
1543PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1544
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001545/* ParseTuple converter: decode bytes objects to unicode using
1546 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1547
1548PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1549
Victor Stinner77c38622010-05-14 15:58:55 +00001550/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1551 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001552
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001553 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1554 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001555
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001556 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001557*/
1558
1559PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1560 const char *s /* encoded string */
1561 );
1562
Victor Stinner77c38622010-05-14 15:58:55 +00001563/* Decode a string using Py_FileSystemDefaultEncoding
1564 and the "surrogateescape" error handler.
1565
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001566 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1567 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001568*/
1569
Martin v. Löwis011e8422009-05-05 04:43:17 +00001570PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1571 const char *s, /* encoded string */
1572 Py_ssize_t size /* size */
1573 );
1574
Victor Stinnerae6265f2010-05-15 16:27:27 +00001575/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001576 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001577
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001578 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1579 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001580*/
1581
1582PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1583 PyObject *unicode
1584 );
1585
Guido van Rossumd8225182000-03-10 22:33:05 +00001586/* --- Methods & Slots ----------------------------------------------------
1587
1588 These are capable of handling Unicode objects and strings on input
1589 (we refer to them as strings in the descriptions) and return
1590 Unicode objects or integers as appropriate. */
1591
1592/* Concat two strings giving a new Unicode string. */
1593
Mark Hammond91a681d2002-08-12 07:21:58 +00001594PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 PyObject *left, /* Left string */
1596 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001597 );
1598
Walter Dörwald1ab83302007-05-18 17:15:44 +00001599/* Concat two strings and put the result in *pleft
1600 (sets *pleft to NULL on error) */
1601
1602PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 PyObject **pleft, /* Pointer to left string */
1604 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001605 );
1606
1607/* Concat two strings, put the result in *pleft and drop the right object
1608 (sets *pleft to NULL on error) */
1609
1610PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001611 PyObject **pleft, /* Pointer to left string */
1612 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001613 );
1614
Guido van Rossumd8225182000-03-10 22:33:05 +00001615/* Split a string giving a list of Unicode strings.
1616
1617 If sep is NULL, splitting will be done at all whitespace
1618 substrings. Otherwise, splits occur at the given separator.
1619
1620 At most maxsplit splits will be done. If negative, no limit is set.
1621
1622 Separators are not included in the resulting list.
1623
1624*/
1625
Mark Hammond91a681d2002-08-12 07:21:58 +00001626PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 PyObject *s, /* String to split */
1628 PyObject *sep, /* String separator */
1629 Py_ssize_t maxsplit /* Maxsplit count */
1630 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001631
1632/* Ditto, but split at line breaks.
1633
1634 CRLF is considered to be one line break. Line breaks are not
1635 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636
Mark Hammond91a681d2002-08-12 07:21:58 +00001637PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001638 PyObject *s, /* String to split */
1639 int keepends /* If true, line end markers are included */
1640 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001641
Thomas Wouters477c8d52006-05-27 19:21:47 +00001642/* Partition a string using a given separator. */
1643
1644PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001645 PyObject *s, /* String to partition */
1646 PyObject *sep /* String separator */
1647 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001648
1649/* Partition a string using a given separator, searching from the end of the
1650 string. */
1651
1652PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001653 PyObject *s, /* String to partition */
1654 PyObject *sep /* String separator */
1655 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001656
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001657/* Split a string giving a list of Unicode strings.
1658
1659 If sep is NULL, splitting will be done at all whitespace
1660 substrings. Otherwise, splits occur at the given separator.
1661
1662 At most maxsplit splits will be done; if negative, no limit is
1663 set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
1664 of the string.
1665
1666 Separators are not included in the resulting list.
1667
1668*/
1669
1670PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 PyObject *s, /* String to split */
1672 PyObject *sep, /* String separator */
1673 Py_ssize_t maxsplit /* Maxsplit count */
1674 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001675
Guido van Rossumd8225182000-03-10 22:33:05 +00001676/* Translate a string by applying a character mapping table to it and
1677 return the resulting Unicode object.
1678
1679 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001681
1682 Mapping tables may be dictionaries or sequences. Unmapped character
1683 ordinals (ones which cause a LookupError) are left untouched and
1684 are copied as-is.
1685
1686*/
1687
Mark Hammond91a681d2002-08-12 07:21:58 +00001688PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 PyObject *str, /* String */
1690 PyObject *table, /* Translate table */
1691 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001692 );
1693
1694/* Join a sequence of strings using the given separator and return
1695 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696
Mark Hammond91a681d2002-08-12 07:21:58 +00001697PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001698 PyObject *separator, /* Separator string */
1699 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001700 );
1701
1702/* Return 1 if substr matches str[start:end] at the given tail end, 0
1703 otherwise. */
1704
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 PyObject *str, /* String */
1707 PyObject *substr, /* Prefix or Suffix string */
1708 Py_ssize_t start, /* Start index */
1709 Py_ssize_t end, /* Stop index */
1710 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001711 );
1712
1713/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001714 given search direction or -1 if not found. -2 is returned in case
1715 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001716
Martin v. Löwis18e16552006-02-15 17:27:45 +00001717PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 PyObject *str, /* String */
1719 PyObject *substr, /* Substring to find */
1720 Py_ssize_t start, /* Start index */
1721 Py_ssize_t end, /* Stop index */
1722 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001723 );
1724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725/* Like PyUnicode_Find, but search for a single character only. */
1726PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1727 PyObject *str, /* String */
1728 Py_UCS4 ch, /* Character to find */
1729 Py_ssize_t start, /* Start index */
1730 Py_ssize_t end, /* Stop index */
1731 int direction /* Find direction, as for PyUnicode_Find: +1 forward, -1 backward */
1732 );
1733
Barry Warsaw51ac5802000-03-20 16:36:48 +00001734/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001735
Martin v. Löwis18e16552006-02-15 17:27:45 +00001736PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 PyObject *str, /* String */
1738 PyObject *substr, /* Substring to count */
1739 Py_ssize_t start, /* Start index */
1740 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001741 );
1742
Barry Warsaw51ac5802000-03-20 16:36:48 +00001743/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001744 and return the resulting Unicode object. */
1745
Mark Hammond91a681d2002-08-12 07:21:58 +00001746PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001747 PyObject *str, /* String */
1748 PyObject *substr, /* Substring to find */
1749 PyObject *replstr, /* Substring to replace */
1750 Py_ssize_t maxcount /* Max. number of replacements to apply;
1751 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001752 );
1753
1754/* Compare two strings and return -1, 0, 1 for less than, equal,
1755 greater than resp. */
1756
Mark Hammond91a681d2002-08-12 07:21:58 +00001757PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 PyObject *left, /* Left string */
1759 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001760 );
1761
/* Compare a string with an ASCII-encoded C string.
   NOTE(review): presumably returns -1, 0, 1 for less than, equal, greater
   than, like PyUnicode_Compare -- confirm against the C API documentation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001762PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1763 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001764 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001765 );
1766
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001767/* Rich compare two strings and return one of the following:
1768
1769 - NULL in case an exception was raised
1770 - Py_True or Py_False for successful comparisons
1771 - Py_NotImplemented in case the type combination is unknown
1772
1773 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1774 case the conversion of the arguments to Unicode fails with a
1775 UnicodeDecodeError.
1776
1777 Possible values for op:
1778
1779 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1780
1781*/
1782
1783PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001784 PyObject *left, /* Left string */
1785 PyObject *right, /* Right string */
1786 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001787 );
1788
Thomas Wouters7e474022000-07-16 12:04:32 +00001789/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001790 the resulting Unicode string. */
1791
Mark Hammond91a681d2002-08-12 07:21:58 +00001792PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001793 PyObject *format, /* Format string */
1794 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001795 );
1796
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001797/* Checks whether element is contained in container and return 1/0
1798 accordingly.
1799
1800 element has to coerce to a one-element Unicode string. -1 is
1801 returned in case of an error. */
1802
Mark Hammond91a681d2002-08-12 07:21:58 +00001803PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001804 PyObject *container, /* Container string */
1805 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001806 );
1807
Martin v. Löwis47383402007-08-15 07:32:56 +00001808/* Checks whether argument is a valid identifier. */
1809
1810PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1811
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001812#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001813/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001814PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001815 PyUnicodeObject *self,
1816 int striptype,
1817 PyObject *sepobj
1818 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001819#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001820
Eric Smith5807c412008-05-11 21:00:57 +00001821/* Using the current locale, insert the thousands grouping
1822 into the string pointed to by buffer. For the argument descriptions,
1823 see Objects/stringlib/localeutil.h */
1824
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001825#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001826PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1827 Py_ssize_t n_buffer,
1828 Py_UNICODE *digits,
1829 Py_ssize_t n_digits,
1830 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001831#endif
Eric Smith5807c412008-05-11 21:00:57 +00001832
Eric Smitha3b1ac82009-04-03 14:45:06 +00001833/* Using explicit passed-in values, insert the thousands grouping
1834 into the string pointed to by buffer. For the argument descriptions,
1835 see Objects/stringlib/localeutil.h.
   Unlike the locale variant, grouping and thousands_sep are supplied by
   the caller, and buffer/digits are untyped (void *) pointers accompanied
   by a kind argument.
   NOTE(review): kind presumably selects the character width used for both
   buffer and digits -- confirm against the implementation. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001836#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1838 int kind,
1839 void *buffer,
1840 Py_ssize_t n_buffer,
1841 void *digits,
1842 Py_ssize_t n_digits,
1843 Py_ssize_t min_width,
1844 const char *grouping,
1845 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001846#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001847/* === Characters Type APIs =============================================== */
1848
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001849/* Helper array used by Py_UNICODE_ISSPACE().
   Declared here as an unbounded array of const unsigned char.
   NOTE(review): the array length and indexing scheme are not visible in
   this declaration -- confirm against the definition.
   The Py_LIMITED_API guard opened just below stays open until the
   "#endif / * Py_LIMITED_API * /" that follows PyUnicode_AsUnicodeCopy. */
1850
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001851#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001852PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1853
Guido van Rossumd8225182000-03-10 22:33:05 +00001854/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001855 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001856
1857 These APIs are implemented in Objects/unicodectype.c.
   Each function below takes a single Py_UCS4 code point; the
   _PyUnicode_Is* functions return int.
   NOTE(review): the int is presumably non-zero when the property holds
   and 0 otherwise -- confirm in Objects/unicodectype.c.
1858
1859*/
1860
Mark Hammond91a681d2002-08-12 07:21:58 +00001861PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001862 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001863 );
1864
Mark Hammond91a681d2002-08-12 07:21:58 +00001865PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001866 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001867 );
1868
Mark Hammond91a681d2002-08-12 07:21:58 +00001869PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001870 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001871 );
1872
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001873PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001874 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001875 );
1876
1877PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001878 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001879 );
1880
/* NOTE(review): the next two declarations qualify the by-value parameter
   as const, unlike the rest of this group. A top-level qualifier on a
   parameter does not change the function's type, so this is only a
   spelling inconsistency. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001881PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001882 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001883 );
1884
Mark Hammond91a681d2002-08-12 07:21:58 +00001885PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001886 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001887 );
1888
/* Conversion helpers taking a single Py_UCS4 code point.
   The three case conversions (ToLowercase, ToUppercase, ToTitlecase)
   return a Py_UCS4; ToDecimalDigit and ToDigit return int; ToNumeric
   returns double.
   NOTE(review): the value returned for a character that has no such
   mapping or numeric property is not stated in this header -- confirm
   against the implementation. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001889PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1890 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001891 );
1892
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001893PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1894 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001895 );
1896
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001897PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1898 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001899 );
1900
Mark Hammond91a681d2002-08-12 07:21:58 +00001901PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001902 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001903 );
1904
Mark Hammond91a681d2002-08-12 07:21:58 +00001905PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001906 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001907 );
1908
Mark Hammond91a681d2002-08-12 07:21:58 +00001909PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001910 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001911 );
1912
/* Further character-class predicates (decimal digit, digit, numeric,
   printable, alpha). Each takes a single Py_UCS4 code point and
   returns int.
   NOTE(review): presumably non-zero when the character belongs to the
   class and 0 otherwise -- confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001913PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001914 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001915 );
1916
Mark Hammond91a681d2002-08-12 07:21:58 +00001917PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001918 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001919 );
1920
Mark Hammond91a681d2002-08-12 07:21:58 +00001921PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001922 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001923 );
1924
Georg Brandl559e5d72008-06-11 18:37:52 +00001925PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001926 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001927 );
1928
Mark Hammond91a681d2002-08-12 07:21:58 +00001929PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001930 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001931 );
1932
/* String helpers operating on Py_UNICODE arrays.
   NOTE(review): the names mirror the C library's strlen, strcpy, strcat,
   strncpy, strcmp, strncmp, strchr and strrchr; presumably they have the
   same semantics on nul-terminated Py_UNICODE strings -- confirm against
   the implementation.
   As declared, strchr and strrchr take a const Py_UNICODE * but return a
   non-const Py_UNICODE *. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001933PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1934 const Py_UNICODE *u
1935 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001936
1937PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001938 Py_UNICODE *s1,
1939 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001940
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001941PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1942 Py_UNICODE *s1, const Py_UNICODE *s2);
1943
Martin v. Löwis5b222132007-06-10 09:51:05 +00001944PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001945 Py_UNICODE *s1,
1946 const Py_UNICODE *s2,
1947 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001948
1949PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001950 const Py_UNICODE *s1,
1951 const Py_UNICODE *s2
1952 );
1953
1954PyAPI_FUNC(int) Py_UNICODE_strncmp(
1955 const Py_UNICODE *s1,
1956 const Py_UNICODE *s2,
1957 size_t n
1958 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001959
1960PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001961 const Py_UNICODE *s,
1962 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001963 );
1964
Victor Stinner331ea922010-08-10 16:37:20 +00001965PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001966 const Py_UNICODE *s,
1967 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001968 );
1969
/* String helpers operating on Py_UCS4 arrays; the signatures parallel the
   Py_UNICODE_str* declarations with Py_UCS4 as the element type.
   NOTE(review): presumably these behave like the C library functions of
   the same suffix on nul-terminated Py_UCS4 strings -- confirm against
   the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970PyAPI_FUNC(size_t) Py_UCS4_strlen(
1971 const Py_UCS4 *u
1972 );
1973
1974PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1975 Py_UCS4 *s1,
1976 const Py_UCS4 *s2);
1977
1978PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1979 Py_UCS4 *s1, const Py_UCS4 *s2);
1980
1981PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1982 Py_UCS4 *s1,
1983 const Py_UCS4 *s2,
1984 size_t n);
1985
1986PyAPI_FUNC(int) Py_UCS4_strcmp(
1987 const Py_UCS4 *s1,
1988 const Py_UCS4 *s2
1989 );
1990
1991PyAPI_FUNC(int) Py_UCS4_strncmp(
1992 const Py_UCS4 *s1,
1993 const Py_UCS4 *s2,
1994 size_t n
1995 );
1996
1997PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1998 const Py_UCS4 *s,
1999 Py_UCS4 c
2000 );
2001
2002PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
2003 const Py_UCS4 *s,
2004 Py_UCS4 c
2005 );
2006
Victor Stinner71133ff2010-09-01 23:43:53 +00002007/* Create a copy of a unicode string ending with a nul character. Return NULL
2008 and raise a MemoryError exception on memory allocation failure, otherwise
2009 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2010
Victor Stinner46408602010-09-03 16:18:00 +00002011PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002012 PyObject *unicode
2013 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002014#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002015
Guido van Rossumd8225182000-03-10 22:33:05 +00002016#ifdef __cplusplus
2017}
2018#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002019#endif /* !Py_UNICODEOBJECT_H */