blob: f6b105a8d8967e036551cedd1f81f2369cf7406c [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   target: pointer to a writable Py_UNICODE buffer with room for at least
           `length` elements.
   value:  the Py_UNICODE code unit to store.
   length: number of elements to fill; values <= 0 fill nothing.

   Every argument is evaluated exactly once (the length is captured in a
   local before the loop), so it is safe to pass expressions with side
   effects.  Expands to a single statement via do { ... } while (0). */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with UTF-16 surrogates.

   Each predicate yields non-zero when ch lies in the tested range:
     - Py_UNICODE_IS_SURROGATE:      0xD800..0xDFFF
     - Py_UNICODE_IS_HIGH_SURROGATE: 0xD800..0xDBFF
     - Py_UNICODE_IS_LOW_SURROGATE:  0xDC00..0xDFFF

   ch is parenthesized in the expansion so that arguments built from
   lower-precedence operators (e.g. `c & 0xFFFF` or `a ? b : c`) are tested
   as a whole.  ch may be evaluated twice, so avoid side effects in it. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument contribute: the bits of `high`
   become bits 10..19 of the result, the bits of `low` become bits 0..9,
   and 0x10000 is then added.  No check is made that the arguments really
   are a high and a low surrogate. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200209 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
220 * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
241 * kind = PyUnicode_WCHAR_KIND
242 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200243 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * ready = 0
245 * wstr is not NULL
246 * data.any is NULL
247 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200248 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200250
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200251 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200252
253 * structure = PyUnicodeObject structure
254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255 PyUnicode_4BYTE_KIND
256 * compact = 0
257 * ready = 1
258 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200259 * utf8 is shared and utf8_length = length with data.any if ascii = 1
260 * utf8_length = 0 if utf8 is NULL
261 * wstr is shared and wstr_length = length with data.any
262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
263 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
264 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200265
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200266 Compact strings use only one memory block (structure + characters),
267 whereas legacy strings use one block for the structure and one block
268 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200269
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200270 Legacy strings are created by PyUnicode_FromUnicode() and
271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272 when PyUnicode_READY() is called.
273
274 See also _PyUnicode_CheckConsistency().
275 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000276 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000278 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 struct {
280 /*
281 SSTATE_NOT_INTERNED (0)
282 SSTATE_INTERNED_MORTAL (1)
283 SSTATE_INTERNED_IMMORTAL (2)
284
285 If interned != SSTATE_NOT_INTERNED, the two references from the
286 dictionary to this object are *not* counted in ob_refcnt.
287 */
288 unsigned int interned:2;
289 /* Character size:
290
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200291 - PyUnicode_WCHAR_KIND (0):
292
293 * character type = wchar_t (16 or 32 bits, depending on the
294 platform)
295
296 - PyUnicode_1BYTE_KIND (1):
297
298 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200299 * if ascii is set, all characters must be in range
300 U+0000-U+007F, otherwise at least one character must be in range
301 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302
303 - PyUnicode_2BYTE_KIND (2):
304
305 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200306 * at least one character must be in range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200307
308 - PyUnicode_4BYTE_KIND (3):
309
310 * character type = Py_UCS4 (32 bits, unsigned)
311 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 */
313 unsigned int kind:2;
314 /* Compact is with respect to the allocation scheme. Compact unicode
315 objects only require one memory block while non-compact objects use
316 one block for the PyUnicodeObject struct and another for its data
317 buffer. */
318 unsigned int compact:1;
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200319 /* The string only contains characters in range U+0000-U+007F (ASCII)
320 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
321 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 unsigned int ascii:1;
323 /* The ready flag indicates whether the object layout is initialized
324 completely. This means that this is either a compact object, or
325 the data pointer is filled out. The bit is redundant, and helps
326 to minimize the test in PyUnicode_IS_READY(). */
327 unsigned int ready:1;
328 } state;
329 wchar_t *wstr; /* wchar_t representation (null-terminated) */
330} PyASCIIObject;
331
332/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200333 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 immediately follow the structure. */
335typedef struct {
336 PyASCIIObject _base;
337 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
338 * terminating \0. */
339 char *utf8; /* UTF-8 representation (null-terminated) */
340 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
341 * surrogates count as two code points. */
342} PyCompactUnicodeObject;
343
344/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
345 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200346 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200347typedef struct {
348 PyCompactUnicodeObject _base;
349 union {
350 void *any;
351 Py_UCS1 *latin1;
352 Py_UCS2 *ucs2;
353 Py_UCS4 *ucs4;
354 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000355} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000356#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000357
Mark Hammond91a681d2002-08-12 07:21:58 +0000358PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000359PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000360
Thomas Wouters27d517b2007-02-25 20:39:11 +0000361#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000362 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
363#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000364
365/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000366#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200367
/* Return the length of the wstr (wchar_t) representation of op.

   For compact ASCII strings (PyASCIIObject layout) there is no separate
   wstr_length member, so the `length` field is used; for every other
   layout the value is read from PyCompactUnicodeObject.wstr_length.

   No type checks or Ready calls are performed.  op is parenthesized
   before being cast so that expression arguments (pointer arithmetic,
   conditional expressions, ...) are converted as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
372
373/* Returns the deprecated Py_UNICODE representation's size in code units
374 (this includes surrogate pairs as 2 units).
375 If the Py_UNICODE representation is not available, it will be computed
376 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
377
Guido van Rossumd8225182000-03-10 22:33:05 +0000378#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200379 (assert(PyUnicode_Check(op)), \
380 (((PyASCIIObject *)(op))->wstr) ? \
381 PyUnicode_WSTR_LENGTH(op) : \
382 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
383 PyUnicode_WSTR_LENGTH(op)))
384
Guido van Rossumd8225182000-03-10 22:33:05 +0000385#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
387
388/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
389 representation on demand. Using this macro is very inefficient now,
390 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
391 use PyUnicode_WRITE() and PyUnicode_READ(). */
392
Guido van Rossumd8225182000-03-10 22:33:05 +0000393#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200394 (assert(PyUnicode_Check(op)), \
395 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
396 PyUnicode_AsUnicode((PyObject *)(op)))
397
Guido van Rossumd8225182000-03-10 22:33:05 +0000398#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200399 ((const char *)(PyUnicode_AS_UNICODE(op)))
400
401
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200402/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200403
404/* Values for PyUnicodeObject.state: */
405
406/* Interning state. */
407#define SSTATE_NOT_INTERNED 0
408#define SSTATE_INTERNED_MORTAL 1
409#define SSTATE_INTERNED_IMMORTAL 2
410
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.

   The value is read from the state.ascii bit.  op is parenthesized before
   the cast so that expression arguments are converted as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed.
   The value is read from the state.compact bit. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not. No type checks or Ready calls are performed.
   op is evaluated once or twice (the && short-circuits). */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200426
427/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200428 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200429 has not been called yet. */
430#define PyUnicode_WCHAR_KIND 0
431
432/* Return values of the PyUnicode_KIND() macro: */
433
434#define PyUnicode_1BYTE_KIND 1
435#define PyUnicode_2BYTE_KIND 2
436#define PyUnicode_4BYTE_KIND 3
437
438
439/* Return the number of bytes the string uses to represent single characters,
Victor Stinner4584a5b2011-10-01 02:39:37 +0200440 this can be 1, 2 or 4.
441
442 See also PyUnicode_KIND_SIZE(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443#define PyUnicode_CHARACTER_SIZE(op) \
Victor Stinnerb066cc62011-10-06 15:54:53 +0200444 (((Py_ssize_t)1 << (PyUnicode_KIND(op) - 1)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445
Georg Brandl4975a9b2011-10-05 16:12:21 +0200446/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447 Py_UCS2, or Py_UCS4 for direct character access.
448 No checks are performed, use PyUnicode_CHARACTER_SIZE or
449 PyUnicode_KIND() before to ensure these will work correctly. */
450
451#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
452#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
453#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
454
Victor Stinner157f83f2011-09-28 21:41:31 +0200455/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200456#define PyUnicode_KIND(op) \
457 (assert(PyUnicode_Check(op)), \
458 assert(PyUnicode_IS_READY(op)), \
459 ((PyASCIIObject *)(op))->state.kind)
460
Victor Stinner157f83f2011-09-28 21:41:31 +0200461/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462#define _PyUnicode_COMPACT_DATA(op) \
463 (PyUnicode_IS_COMPACT_ASCII(op) ? \
464 ((void*)((PyASCIIObject*)(op) + 1)) : \
465 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
466
467#define _PyUnicode_NONCOMPACT_DATA(op) \
468 (assert(((PyUnicodeObject*)(op))->data.any), \
469 ((((PyUnicodeObject *)(op))->data.any)))
470
471#define PyUnicode_DATA(op) \
472 (assert(PyUnicode_Check(op)), \
473 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
474 _PyUnicode_NONCOMPACT_DATA(op))
475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200476/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
Victor Stinner4584a5b2011-10-01 02:39:37 +0200477 The index is a character index, the result is a size in bytes.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200478
Victor Stinner4584a5b2011-10-01 02:39:37 +0200479 See also PyUnicode_CHARACTER_SIZE(). */
Antoine Pitroudbf697a2011-10-06 15:34:41 +0200480#define PyUnicode_KIND_SIZE(kind, index) \
Victor Stinnerb066cc62011-10-06 15:54:53 +0200481 (((Py_ssize_t)(index)) << ((kind) - 1))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200482
483/* In the access macros below, "kind" may be evaluated more than once.
484 All other macro parameters are evaluated exactly once, so it is safe
485 to put side effects into them (such as increasing the index). */
486
487/* Write into the canonical representation, this macro does not do any sanity
488 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200489 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200491 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200492#define PyUnicode_WRITE(kind, data, index, value) \
493 do { \
494 switch ((kind)) { \
495 case PyUnicode_1BYTE_KIND: { \
496 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
497 break; \
498 } \
499 case PyUnicode_2BYTE_KIND: { \
500 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
501 break; \
502 } \
503 default: { \
504 assert((kind) == PyUnicode_4BYTE_KIND); \
505 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
506 } \
507 } \
508 } while (0)
509
Georg Brandl07de3252011-10-05 16:47:38 +0200510/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511 or ready calls are performed. */
512#define PyUnicode_READ(kind, data, index) \
513 ((Py_UCS4) \
514 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200515 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200516 ((kind) == PyUnicode_2BYTE_KIND ? \
517 ((const Py_UCS2 *)(data))[(index)] : \
518 ((const Py_UCS4 *)(data))[(index)] \
519 ) \
520 ))
521
522/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
523 calls PyUnicode_KIND() and might call it twice. For single reads, use
524 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
525 cache kind and use PyUnicode_READ instead. */
526#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200527 (assert(PyUnicode_Check(unicode)), \
528 assert(PyUnicode_IS_READY(unicode)), \
529 (Py_UCS4) \
530 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
531 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
532 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
533 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
534 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
535 ) \
536 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537
538/* Returns the length of the unicode string. The caller has to make sure that
539 the string has its canonical representation set before calling
540 this macro. Call PyUnicode_READY() to ensure that. */
541#define PyUnicode_GET_LENGTH(op) \
542 (assert(PyUnicode_Check(op)), \
543 assert(PyUnicode_IS_READY(op)), \
544 ((PyASCIIObject *)(op))->length)
545
546
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   The value is read from the state.ready bit; no type check is performed.
   op is parenthesized before the cast so that expression arguments are
   converted as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
551
Victor Stinnera3b334d2011-10-03 13:53:37 +0200552/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200554 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 Returns 0 on success and -1 on errors. */
556#define PyUnicode_READY(op) \
557 (assert(PyUnicode_Check(op)), \
558 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200559 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561/* Return a maximum character value which is suitable for creating another
562 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200563 than iterating over the string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564#define PyUnicode_MAX_CHAR_VALUE(op) \
565 (assert(PyUnicode_IS_READY(op)), \
566 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
567 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
568 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
569 (0x7fU) : (0xffU) \
570 ) : \
571 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
572 (0xffffU) : (0x10ffffU) \
573 ))))
574
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000575#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000576
577/* --- Constants ---------------------------------------------------------- */
578
579/* This Unicode character will be used as replacement character during
580 decoding if the errors argument is set to "replace". Note: the
581 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
582 Unicode 3.0. */
583
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200584#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000585
586/* === Public API ========================================================= */
587
588/* --- Plain Py_UNICODE --------------------------------------------------- */
589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200590/* With PEP 393, this is the recommended way to allocate a new unicode object.
591 This function will allocate the object and its buffer in a single memory
592 block. Objects created using this function are not resizable. */
593#ifndef Py_LIMITED_API
594PyAPI_FUNC(PyObject*) PyUnicode_New(
595 Py_ssize_t size, /* Number of code points in the new string */
596 Py_UCS4 maxchar /* maximum code point value in the string */
597 );
598#endif
599
Victor Stinnerd8f65102011-09-29 19:43:17 +0200600/* Initializes the canonical string representation from the deprecated
601 wstr/Py_UNICODE representation. This function is used to convert Unicode
602 objects which were created using the old API to the new flexible format
603 introduced with PEP 393.
604
605 Don't call this function directly, use the public PyUnicode_READY() macro
606 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607#ifndef Py_LIMITED_API
608PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200609 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610 );
611#endif
612
Victor Stinner034f6cf2011-09-30 02:26:44 +0200613/* Get a copy of a Unicode string. */
614PyAPI_FUNC(PyObject*) PyUnicode_Copy(
615 PyObject *unicode
616 );
617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200618/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200619 character conversion when necessary and falls back to memcpy if possible.
620
Victor Stinnera0702ab2011-09-29 14:14:38 +0200621 Fail if to is too small (smaller than how_many or smaller than
622 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
623 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200624
625 Return the number of written characters, or return -1 and raise an exception
626 on error.
627
628 Pseudo-code:
629
630 how_many = min(how_many, len(from) - from_start)
631 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
632 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200633
634 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200635 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200636#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200637PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638 PyObject *to,
639 Py_ssize_t to_start,
640 PyObject *from,
641 Py_ssize_t from_start,
642 Py_ssize_t how_many
643 );
644#endif
645
Guido van Rossumd8225182000-03-10 22:33:05 +0000646/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000648
649 u may be NULL which causes the contents to be undefined. It is the
650 user's responsibility to fill in the needed data afterwards. Note
651 that modifying the Unicode object contents after construction is
652 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000653
654 The buffer is copied into the new object. */
655
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000656#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000657PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000658 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000659 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000660 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000661#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000662
Georg Brandl952867a2010-06-27 10:17:12 +0000663/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000664PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000665 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000666 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000667 );
668
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000669/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000671PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000672 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000673 );
674
Victor Stinnerb9275c12011-10-05 14:01:42 +0200675/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
676 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677#ifndef Py_LIMITED_API
678PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
679 int kind,
680 const void *buffer,
681 Py_ssize_t size);
682#endif
683
684PyAPI_FUNC(PyObject*) PyUnicode_Substring(
685 PyObject *str,
686 Py_ssize_t start,
687 Py_ssize_t end);
688
689/* Copy the string into a UCS4 buffer including the null character if copy_null
690 is set. Return NULL and raise an exception on error. Raise a ValueError if
691 the buffer is smaller than the string. Return buffer on success.
692
693 buflen is the length of the buffer in (Py_UCS4) characters. */
694PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
695 PyObject *unicode,
696 Py_UCS4* buffer,
697 Py_ssize_t buflen,
698 int copy_null);
699
700/* Copy the string into a UCS4 buffer. A new buffer is allocated using
701 * PyMem_Malloc; if this fails, NULL is returned with a memory error
702 exception set. */
703PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
704
Guido van Rossumd8225182000-03-10 22:33:05 +0000705/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200706 Py_UNICODE buffer.
707 If the wchar_t/Py_UNICODE representation is not yet available, this
708 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000709
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000710#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000711PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000713 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000714#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716/* Return a read-only pointer to the Unicode object's internal
717 Py_UNICODE buffer and save the length at size.
718 If the wchar_t/Py_UNICODE representation is not yet available, this
719 function will calculate it. */
720
721#ifndef Py_LIMITED_API
722PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
723 PyObject *unicode, /* Unicode object */
724 Py_ssize_t *size /* location where to save the length */
725 );
726#endif
727
Guido van Rossumd8225182000-03-10 22:33:05 +0000728/* Get the length of the Unicode object. */
729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200730PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
731 PyObject *unicode
732);
733
Victor Stinner157f83f2011-09-28 21:41:31 +0200734/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735 string representation. */
736
Martin v. Löwis18e16552006-02-15 17:27:45 +0000737PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000739 );
740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741/* Read a character from the string. */
742
743PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
744 PyObject *unicode,
745 Py_ssize_t index
746 );
747
748/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200749 PyUnicode_New, must not be shared, and must not have been hashed yet.
750
751 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200752
753PyAPI_FUNC(int) PyUnicode_WriteChar(
754 PyObject *unicode,
755 Py_ssize_t index,
756 Py_UCS4 character
757 );
758
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000759#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000760/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000761PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000762#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000763
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200764/* Resize a Unicode object allocated by the legacy API (e.g.
765 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
766 PyUnicode_New) cannot be resized by this function.
767
768 The length is a number of Py_UNICODE characters (and not the number of code
769 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000770
771 *unicode is modified to point to the new (resized) object and 0
772 returned on success.
773
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200774 If the refcount on the object is 1, the function resizes the string in
775 place, which is usually faster than allocating a new string (and copy
776 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000777
778 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200779 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000780
Mark Hammond91a681d2002-08-12 07:21:58 +0000781PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 PyObject **unicode, /* Pointer to the Unicode object */
783 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000784 );
785
Guido van Rossumd8225182000-03-10 22:33:05 +0000786/* Coerce obj to a Unicode object and return a reference with
787 *incremented* refcount.
788
789 Coercion is done in the following way:
790
Georg Brandl952867a2010-06-27 10:17:12 +0000791 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000792 under the assumptions that they contain data using the UTF-8
793 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000794
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000795 2. All other objects (including Unicode objects) raise an
796 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000797
798 The API returns NULL in case of an error. The caller is responsible
799 for decref'ing the returned objects.
800
801*/
802
Mark Hammond91a681d2002-08-12 07:21:58 +0000803PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000805 const char *encoding, /* encoding */
806 const char *errors /* error handling */
807 );
808
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000809/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000810 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000812 Unicode objects are passed back as-is (subclasses are converted to
813 true Unicode objects), all other objects are delegated to
814 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000815 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000816
817 The API returns NULL in case of an error. The caller is responsible
818 for decref'ing the returned objects.
819
820*/
821
Mark Hammond91a681d2002-08-12 07:21:58 +0000822PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000824 );
825
Victor Stinner1205f272010-09-11 00:54:47 +0000826PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
827 const char *format, /* ASCII-encoded string */
828 va_list vargs
829 );
830PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
831 const char *format, /* ASCII-encoded string */
832 ...
833 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000834
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000835#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000836/* Format the object based on the format_spec, as defined in PEP 3101
837 (Advanced String Formatting). */
838PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 PyObject *format_spec,
840 Py_ssize_t start,
841 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000842#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000843
Walter Dörwald16807132007-05-25 13:52:07 +0000844PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
845PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000846PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
847 const char *u /* UTF-8 encoded string */
848 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000849#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000850PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000851#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000852
853/* Use only if you know it's a string: op is cast to PyASCIIObject *
   without any type check, and the macro expands to the state.interned
   field of that object. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854#define PyUnicode_CHECK_INTERNED(op) \
855 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000856
Guido van Rossumd8225182000-03-10 22:33:05 +0000857/* --- wchar_t support for platforms which support it --------------------- */
858
859#ifdef HAVE_WCHAR_H
860
Georg Brandl952867a2010-06-27 10:17:12 +0000861/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000862 size.
863
864 The buffer is copied into the new object. */
865
Mark Hammond91a681d2002-08-12 07:21:58 +0000866PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000868 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000869 );
870
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000871/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000872 most size wchar_t characters are copied.
873
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000874 Note that the resulting wchar_t string may or may not be
875 0-terminated. It is the responsibility of the caller to make sure
876 that the wchar_t string is 0-terminated in case this is required by
877 the application.
878
879 Returns the number of wchar_t characters copied (excluding a
880 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000881 error. */
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000884 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000885 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000886 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000887 );
888
Victor Stinner137c34c2010-09-29 10:25:54 +0000889/* Convert the Unicode object to a wide character string. The output string
890 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200891 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000892
893 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
894 on success. On error, returns NULL, *size is undefined and raises a
895 MemoryError. */
896
897PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000898 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000899 Py_ssize_t *size /* number of characters of the result */
900 );
901
Victor Stinner9f789e72011-10-01 03:57:28 +0200902#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200904#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905
Guido van Rossumd8225182000-03-10 22:33:05 +0000906#endif
907
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000908/* --- Unicode ordinals --------------------------------------------------- */
909
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910/* Create a Unicode Object from the given Unicode code point ordinal.
911
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912 The ordinal must be in range(0x10000) on narrow Python builds
913 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
914 raised in case it is not.
915
916*/
917
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000918PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000920/* --- Free-list management ----------------------------------------------- */
921
922/* Clear the free list used by the Unicode implementation.
923
924 This can be used to release memory used for objects on the free
925 list back to the Python memory allocator.
926
927*/
928
929PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
930
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000932
933 Many of these APIs take two arguments encoding and errors. These
934 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000935 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000936
Georg Brandl952867a2010-06-27 10:17:12 +0000937 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000938
939 Error handling is set by errors which may also be set to NULL
940 meaning to use the default handling defined for the codec. Default
941 error handling for all builtin codecs is "strict" (ValueErrors are
942 raised).
943
944 The codecs all use a similar interface. Only deviations from the
945 generic ones are documented.
946
947*/
948
Fred Drakecb093fe2000-05-09 19:51:53 +0000949/* --- Manage the default encoding ---------------------------------------- */
950
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000951/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000952 Unicode object unicode and the size of the encoded representation
953 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000954
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000955 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000956
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200957 This function caches the UTF-8 encoded string in the unicodeobject
958 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 when the unicodeobject is deallocated.
960
961 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
962 support the previous internal function with the same behaviour.
963
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000964 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000965 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000966
967 *** If you need to access the Unicode object as UTF-8 bytes string,
968 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000969*/
970
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000971#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000973 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000974 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000976#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000977
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000978/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000979 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
982 in the unicodeobject.
983
984 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
985 support the previous internal function with the same behaviour.
986
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000987 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000988 extracted from the returned data.
989
990 *** This API is for interpreter INTERNAL USE ONLY and will likely
991 *** be removed or changed in the future.
992
993 *** If you need to access the Unicode object as UTF-8 bytes string,
994 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000995
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000996*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000997
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000998#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1000#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001001#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001002
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001003/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001004
Mark Hammond91a681d2002-08-12 07:21:58 +00001005PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001006
Guido van Rossumd8225182000-03-10 22:33:05 +00001007/* --- Generic Codecs ----------------------------------------------------- */
1008
1009/* Create a Unicode object by decoding the encoded string s of the
1010 given size. */
1011
Mark Hammond91a681d2002-08-12 07:21:58 +00001012PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001013 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001014 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001015 const char *encoding, /* encoding */
1016 const char *errors /* error handling */
1017 );
1018
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001019/* Decode a Unicode object unicode and return the result as Python
1020 object. */
1021
1022PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023 PyObject *unicode, /* Unicode object */
1024 const char *encoding, /* encoding */
1025 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001026 );
1027
1028/* Decode a Unicode object unicode and return the result as Unicode
1029 object. */
1030
1031PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001032 PyObject *unicode, /* Unicode object */
1033 const char *encoding, /* encoding */
1034 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001035 );
1036
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001038 Python string object. */
1039
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001040#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001041PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001042 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001043 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001044 const char *encoding, /* encoding */
1045 const char *errors /* error handling */
1046 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001047#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001048
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001049/* Encodes a Unicode object and returns the result as Python
1050 object. */
1051
1052PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001053 PyObject *unicode, /* Unicode object */
1054 const char *encoding, /* encoding */
1055 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001056 );
1057
Guido van Rossumd8225182000-03-10 22:33:05 +00001058/* Encodes a Unicode object and returns the result as Python string
1059 object. */
1060
Mark Hammond91a681d2002-08-12 07:21:58 +00001061PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 PyObject *unicode, /* Unicode object */
1063 const char *encoding, /* encoding */
1064 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001065 );
1066
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001067/* Encodes a Unicode object and returns the result as Unicode
1068 object. */
1069
1070PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 PyObject *unicode, /* Unicode object */
1072 const char *encoding, /* encoding */
1073 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001074 );
1075
1076/* Build an encoding map. */
1077
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001078PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1079 PyObject* string /* 256 character map */
1080 );
1081
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001082/* --- UTF-7 Codecs ------------------------------------------------------- */
1083
Mark Hammond91a681d2002-08-12 07:21:58 +00001084PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 const char *string, /* UTF-7 encoded string */
1086 Py_ssize_t length, /* size of string */
1087 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001088 );
1089
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001090PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 const char *string, /* UTF-7 encoded string */
1092 Py_ssize_t length, /* size of string */
1093 const char *errors, /* error handling */
1094 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001095 );
1096
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001097#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001098PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099 const Py_UNICODE *data, /* Unicode char buffer */
1100 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1101 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1102 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1103 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001104 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001105#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001106
Guido van Rossumd8225182000-03-10 22:33:05 +00001107/* --- UTF-8 Codecs ------------------------------------------------------- */
1108
Mark Hammond91a681d2002-08-12 07:21:58 +00001109PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 const char *string, /* UTF-8 encoded string */
1111 Py_ssize_t length, /* size of string */
1112 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001113 );
1114
Walter Dörwald69652032004-09-07 20:24:22 +00001115PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116 const char *string, /* UTF-8 encoded string */
1117 Py_ssize_t length, /* size of string */
1118 const char *errors, /* error handling */
1119 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001120 );
1121
Mark Hammond91a681d2002-08-12 07:21:58 +00001122PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001124 );
1125
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001126#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1128 PyObject *unicode,
1129 const char *errors);
1130
Mark Hammond91a681d2002-08-12 07:21:58 +00001131PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001132 const Py_UNICODE *data, /* Unicode char buffer */
1133 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1134 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001135 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001136#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001137
Walter Dörwald41980ca2007-08-16 21:55:45 +00001138/* --- UTF-32 Codecs ------------------------------------------------------ */
1139
1140/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1141 the corresponding Unicode object.
1142
1143 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001145
1146 If byteorder is non-NULL, the decoder starts decoding using the
1147 given byte order:
1148
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 *byteorder == -1: little endian
1150 *byteorder == 0: native order
1151 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001152
1153 In native mode, the first four bytes of the stream are checked for a
1154 BOM mark. If found, the BOM mark is analysed, the byte order
1155 adjusted and the BOM skipped. In the other modes, no BOM mark
1156 interpretation is done. After completion, *byteorder is set to the
1157 current byte order at the end of input data.
1158
1159 If byteorder is NULL, the codec starts in native order mode.
1160
1161*/
1162
1163PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 const char *string, /* UTF-32 encoded string */
1165 Py_ssize_t length, /* size of string */
1166 const char *errors, /* error handling */
1167 int *byteorder /* pointer to byteorder to use
1168 0=native;-1=LE,1=BE; updated on
1169 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001170 );
1171
1172PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 const char *string, /* UTF-32 encoded string */
1174 Py_ssize_t length, /* size of string */
1175 const char *errors, /* error handling */
1176 int *byteorder, /* pointer to byteorder to use
1177 0=native;-1=LE,1=BE; updated on
1178 exit */
1179 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001180 );
1181
1182/* Returns a Python string using the UTF-32 encoding in native byte
1183 order. The string always starts with a BOM mark. */
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001187 );
1188
1189/* Returns a Python string object holding the UTF-32 encoded value of
1190 the Unicode data.
1191
1192 The output is written according to the value of byteorder, as
1193 follows:
1194
1195 byteorder == -1: little endian
1196 byteorder == 0: native byte order (writes a BOM mark)
1197 byteorder == 1: big endian
1198
1199 If byteorder is 0, the output string will always start with the
1200 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1201 prepended.
1202
1203*/
1204
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001205#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001206PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 const Py_UNICODE *data, /* Unicode char buffer */
1208 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1209 const char *errors, /* error handling */
1210 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001211 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001212#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001213
Guido van Rossumd8225182000-03-10 22:33:05 +00001214/* --- UTF-16 Codecs ------------------------------------------------------ */
1215
Guido van Rossum9e896b32000-04-05 20:11:21 +00001216/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001217 the corresponding Unicode object.
1218
1219 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001221
1222 If byteorder is non-NULL, the decoder starts decoding using the
1223 given byte order:
1224
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225 *byteorder == -1: little endian
1226 *byteorder == 0: native order
1227 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001228
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001229 In native mode, the first two bytes of the stream are checked for a
1230 BOM mark. If found, the BOM mark is analysed, the byte order
1231 adjusted and the BOM skipped. In the other modes, no BOM mark
1232 interpretation is done. After completion, *byteorder is set to the
1233 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001234
1235 If byteorder is NULL, the codec starts in native order mode.
1236
1237*/
1238
Mark Hammond91a681d2002-08-12 07:21:58 +00001239PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 const char *string, /* UTF-16 encoded string */
1241 Py_ssize_t length, /* size of string */
1242 const char *errors, /* error handling */
1243 int *byteorder /* pointer to byteorder to use
1244 0=native;-1=LE,1=BE; updated on
1245 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001246 );
1247
Walter Dörwald69652032004-09-07 20:24:22 +00001248PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 const char *string, /* UTF-16 encoded string */
1250 Py_ssize_t length, /* size of string */
1251 const char *errors, /* error handling */
1252 int *byteorder, /* pointer to byteorder to use
1253 0=native;-1=LE,1=BE; updated on
1254 exit */
1255 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001256 );
1257
Guido van Rossumd8225182000-03-10 22:33:05 +00001258/* Returns a Python string using the UTF-16 encoding in native byte
1259 order. The string always starts with a BOM mark. */
1260
Mark Hammond91a681d2002-08-12 07:21:58 +00001261PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001262 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001263 );
1264
1265/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001266 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001267
1268 The output is written in the byte order selected by the byteorder
1269 argument:
1270
1271 byteorder == -1: little endian
1272 byteorder == 0: native byte order (writes a BOM mark)
1273 byteorder == 1: big endian
1274
1275 If byteorder is 0, the output string will always start with the
1276 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1277 prepended.
1278
1279 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1280 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001281 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001282
1283*/
1284
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001286PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 const Py_UNICODE *data, /* Unicode char buffer */
1288 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1289 const char *errors, /* error handling */
1290 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001291 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001292#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001293
1294/* --- Unicode-Escape Codecs ---------------------------------------------- */
1295
Mark Hammond91a681d2002-08-12 07:21:58 +00001296PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001297 const char *string, /* Unicode-Escape encoded string */
1298 Py_ssize_t length, /* size of string */
1299 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001300 );
1301
Mark Hammond91a681d2002-08-12 07:21:58 +00001302PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001304 );
1305
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001306#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001307PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 const Py_UNICODE *data, /* Unicode char buffer */
1309 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001310 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001311#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001312
1313/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1314
Mark Hammond91a681d2002-08-12 07:21:58 +00001315PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 const char *string, /* Raw-Unicode-Escape encoded string */
1317 Py_ssize_t length, /* size of string */
1318 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001319 );
1320
Mark Hammond91a681d2002-08-12 07:21:58 +00001321PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001323 );
1324
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001325#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001326PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001327 const Py_UNICODE *data, /* Unicode char buffer */
1328 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001329 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001330#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001331
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001332/* --- Unicode Internal Codec ---------------------------------------------
1333
1334 Only for internal use in _codecsmodule.c */
1335
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001336#ifndef Py_LIMITED_API
/* NOTE(review): unlike the other declarations in this header, this one is
   not wrapped in PyAPI_FUNC() -- confirm that this is intentional. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001337PyObject *_PyUnicode_DecodeUnicodeInternal(
1338 const char *string, /* encoded string to decode */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001339 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001340 const char *errors /* error handling */
1341 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001342#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001343
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001345
1346 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1347
1348*/
1349
Mark Hammond91a681d2002-08-12 07:21:58 +00001350PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 const char *string, /* Latin-1 encoded string */
1352 Py_ssize_t length, /* size of string */
1353 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001354 );
1355
Mark Hammond91a681d2002-08-12 07:21:58 +00001356PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001358 );
1359
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001360#ifndef Py_LIMITED_API
/* Private variant taking an explicit error-handling argument; not part of
   the limited API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1362 PyObject* unicode, /* Unicode object */
1363 const char* errors); /* error handling */
1364
Mark Hammond91a681d2002-08-12 07:21:58 +00001365PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 const Py_UNICODE *data, /* Unicode char buffer */
1367 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1368 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001369 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001370#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001371
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001373
1374 Only 7-bit ASCII data is accepted. All other codes generate errors.
1375
1376*/
1377
Mark Hammond91a681d2002-08-12 07:21:58 +00001378PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 const char *string, /* ASCII encoded string */
1380 Py_ssize_t length, /* size of string */
1381 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001382 );
1383
Mark Hammond91a681d2002-08-12 07:21:58 +00001384PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001386 );
1387
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001388#ifndef Py_LIMITED_API
/* Private variant taking an explicit error-handling argument; not part of
   the limited API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1390 PyObject* unicode, /* Unicode object */
1391 const char* errors); /* error handling */
1392
Mark Hammond91a681d2002-08-12 07:21:58 +00001393PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 const Py_UNICODE *data, /* Unicode char buffer */
1395 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1396 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001397 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001398#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001399
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001401
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001403
1404 Decoding mappings must map single string characters to single
1405 Unicode characters, integers (which are then interpreted as Unicode
1406 ordinals) or None (meaning "undefined mapping" and causing an
1407 error).
1408
1409 Encoding mappings must map single Unicode characters to single
1410 string characters, integers (which are then interpreted as Latin-1
1411 ordinals) or None (meaning "undefined mapping" and causing an
1412 error).
1413
1414 If a character lookup fails with a LookupError, the character is
1415 copied as-is meaning that its ordinal value will be interpreted as
1416 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1417 to contain those mappings which map characters to different code
1418 points.
1419
1420*/
1421
Mark Hammond91a681d2002-08-12 07:21:58 +00001422PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 const char *string, /* Encoded string */
1424 Py_ssize_t length, /* size of string */
1425 PyObject *mapping, /* character mapping
1426 (char ordinal -> unicode ordinal) */
1427 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001428 );
1429
Mark Hammond91a681d2002-08-12 07:21:58 +00001430PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 PyObject *unicode, /* Unicode object */
1432 PyObject *mapping /* character mapping
1433 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001434 );
1435
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001436#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001437PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 const Py_UNICODE *data, /* Unicode char buffer */
1439 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1440 PyObject *mapping, /* character mapping
1441 (unicode ordinal -> char ordinal) */
1442 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001443 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001444#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001445
1446/* Translate a Py_UNICODE buffer of the given length by applying a
1447 character mapping table to it and return the resulting Unicode
1448 object.
1449
1450 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001452
1453 Mapping tables may be dictionaries or sequences. Unmapped character
1454 ordinals (ones which cause a LookupError) are left untouched and
1455 are copied as-is.
1456
1457*/
1458
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001459#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001460PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 const Py_UNICODE *data, /* Unicode char buffer */
1462 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1463 PyObject *table, /* Translate table */
1464 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001465 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001466#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001467
Victor Stinner99b95382011-07-04 14:23:54 +02001468#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001469
Guido van Rossumefec1152000-03-28 02:01:15 +00001470/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001471
Mark Hammond91a681d2002-08-12 07:21:58 +00001472PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001473 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001475 const char *errors /* error handling */
1476 );
1477
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001478PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1479 const char *string, /* MBCS encoded string */
1480 Py_ssize_t length, /* size of string */
1481 const char *errors, /* error handling */
1482 Py_ssize_t *consumed /* bytes consumed */
1483 );
1484
Mark Hammond91a681d2002-08-12 07:21:58 +00001485PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001486 PyObject *unicode /* Unicode object */
1487 );
1488
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001489#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001490PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001491 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001492 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001493 const char *errors /* error handling */
1494 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001495#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001496
Victor Stinner99b95382011-07-04 14:23:54 +02001497#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001498
Guido van Rossum9e896b32000-04-05 20:11:21 +00001499/* --- Decimal Encoder ---------------------------------------------------- */
1500
1501/* Takes a Unicode string holding a decimal value and writes it into
1502 an output buffer using standard ASCII digit codes.
1503
1504 The output buffer has to provide at least length+1 bytes of storage
1505 area. The output string is 0-terminated.
1506
1507 The encoder converts whitespace to ' ', decimal characters to their
1508 corresponding ASCII digit and all other Latin-1 characters except
1509 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1510 are treated as errors. This includes embedded NUL characters.
1511
1512 Error handling is defined by the errors argument:
1513
1514 NULL or "strict": raise a ValueError
1515 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001516 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001517 "replace": replaces illegal characters with '?'
1518
1519 Returns 0 on success, -1 on failure.
1520
1521*/
1522
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001523#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001524PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001525 Py_UNICODE *s, /* Unicode buffer */
1526 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1527 char *output, /* Output buffer; must have size >= length + 1
                    (room for the terminating 0 byte) */
1528 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001529 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001530#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001531
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001532/* Transforms code points that have decimal digit property to the
1533 corresponding ASCII digit code points.
1534
1535 Returns a new Unicode string on success, NULL on failure.
1536*/
1537
Georg Brandlb5503082010-12-05 11:40:48 +00001538#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001539PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1540 Py_UNICODE *s, /* Unicode buffer */
1541 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1542 );
Georg Brandlb5503082010-12-05 11:40:48 +00001543#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001545/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1546 as argument instead of a raw buffer and length. This function additionally
1547 transforms spaces to ASCII because this is what the callers in longobject,
1548 floatobject, and complexobject did anyways. */
1549
1550#ifndef Py_LIMITED_API
1551PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1552 PyObject *unicode /* Unicode object */
1553 );
1554#endif
1555
Martin v. Löwis011e8422009-05-05 04:43:17 +00001556/* --- File system encoding ---------------------------------------------- */
1557
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001558/* ParseTuple converter: encode str objects to bytes using
1559 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001560
1561PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1562
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001563/* ParseTuple converter: decode bytes objects to unicode using
1564 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1565
1566PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1567
Victor Stinner77c38622010-05-14 15:58:55 +00001568/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1569 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001570
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001571 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1572 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001573
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001574 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001575*/
1576
1577PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1578 const char *s /* encoded string */
1579 );
1580
Victor Stinner77c38622010-05-14 15:58:55 +00001581/* Decode a string using Py_FileSystemDefaultEncoding
1582 and the "surrogateescape" error handler.
1583
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001584 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1585 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001586*/
1587
Martin v. Löwis011e8422009-05-05 04:43:17 +00001588PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1589 const char *s, /* encoded string */
1590 Py_ssize_t size /* size */
1591 );
1592
Victor Stinnerae6265f2010-05-15 16:27:27 +00001593/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001594 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001595
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001596 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1597 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001598*/
1599
1600PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1601 PyObject *unicode
1602 );
1603
Guido van Rossumd8225182000-03-10 22:33:05 +00001604/* --- Methods & Slots ----------------------------------------------------
1605
1606 These are capable of handling Unicode objects and strings on input
1607 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001608 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001609
1610/* Concat two strings giving a new Unicode string. */
1611
Mark Hammond91a681d2002-08-12 07:21:58 +00001612PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 PyObject *left, /* Left string */
1614 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001615 );
1616
Walter Dörwald1ab83302007-05-18 17:15:44 +00001617/* Concat two strings and put the result in *pleft
1618 (sets *pleft to NULL on error) */
1619
1620PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001621 PyObject **pleft, /* Pointer to left string */
1622 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001623 );
1624
1625/* Concat two strings, put the result in *pleft and drop the right object
1626 (sets *pleft to NULL on error) */
1627
1628PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001629 PyObject **pleft, /* Pointer to left string */
1630 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001631 );
1632
Guido van Rossumd8225182000-03-10 22:33:05 +00001633/* Split a string giving a list of Unicode strings.
1634
1635 If sep is NULL, splitting will be done at all whitespace
1636 substrings. Otherwise, splits occur at the given separator.
1637
1638 At most maxsplit splits will be done. If negative, no limit is set.
1639
1640 Separators are not included in the resulting list.
1641
1642*/
1643
Mark Hammond91a681d2002-08-12 07:21:58 +00001644PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001645 PyObject *s, /* String to split */
1646 PyObject *sep, /* String separator */
1647 Py_ssize_t maxsplit /* Maxsplit count */
1648 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001649
1650/* Ditto, but split at line breaks.
1651
1652 CRLF is considered to be one line break. Line breaks are not
1653 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001654
Mark Hammond91a681d2002-08-12 07:21:58 +00001655PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001656 PyObject *s, /* String to split */
1657 int keepends /* If true, line end markers are included */
1658 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001659
Thomas Wouters477c8d52006-05-27 19:21:47 +00001660/* Partition a string using a given separator. */
1661
1662PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001663 PyObject *s, /* String to partition */
1664 PyObject *sep /* String separator */
1665 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001666
1667/* Partition a string using a given separator, searching from the end of the
1668 string. */
1669
1670PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 PyObject *s, /* String to partition */
1672 PyObject *sep /* String separator */
1673 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001674
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001675/* Split a string giving a list of Unicode strings.
1676
1677 If sep is NULL, splitting will be done at all whitespace
1678 substrings. Otherwise, splits occur at the given separator.
1679
1680 At most maxsplit splits will be done. But unlike PyUnicode_Split
1681 PyUnicode_RSplit splits from the end of the string. If negative,
1682 no limit is set.
1683
1684 Separators are not included in the resulting list.
1685
1686*/
1687
1688PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 PyObject *s, /* String to split */
1690 PyObject *sep, /* String separator */
1691 Py_ssize_t maxsplit /* Maxsplit count */
1692 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001693
Guido van Rossumd8225182000-03-10 22:33:05 +00001694/* Translate a string by applying a character mapping table to it and
1695 return the resulting Unicode object.
1696
1697 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001698 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001699
1700 Mapping tables may be dictionaries or sequences. Unmapped character
1701 ordinals (ones which cause a LookupError) are left untouched and
1702 are copied as-is.
1703
1704*/
1705
Mark Hammond91a681d2002-08-12 07:21:58 +00001706PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001707 PyObject *str, /* String */
1708 PyObject *table, /* Translate table */
1709 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001710 );
1711
1712/* Join a sequence of strings using the given separator and return
1713 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714
Mark Hammond91a681d2002-08-12 07:21:58 +00001715PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001716 PyObject *separator, /* Separator string */
1717 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001718 );
1719
1720/* Return 1 if substr matches str[start:end] at the given tail end, 0
1721 otherwise. */
1722
Martin v. Löwis18e16552006-02-15 17:27:45 +00001723PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 PyObject *str, /* String */
1725 PyObject *substr, /* Prefix or Suffix string */
1726 Py_ssize_t start, /* Start index */
1727 Py_ssize_t end, /* Stop index */
1728 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001729 );
1730
1731/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001732 given search direction or -1 if not found. -2 is returned in case
1733 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001734
Martin v. Löwis18e16552006-02-15 17:27:45 +00001735PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001736 PyObject *str, /* String */
1737 PyObject *substr, /* Substring to find */
1738 Py_ssize_t start, /* Start index */
1739 Py_ssize_t end, /* Stop index */
1740 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001741 );
1742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743/* Like PyUnicode_Find, but search for single character only. */
1744PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1745 PyObject *str, /* String */
1746 Py_UCS4 ch, /* Character to find */
1747 Py_ssize_t start, /* Start index */
1748 Py_ssize_t end, /* Stop index */
1749 int direction /* Find direction (see PyUnicode_Find) */
1750 );
1751
Barry Warsaw51ac5802000-03-20 16:36:48 +00001752/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001753
Martin v. Löwis18e16552006-02-15 17:27:45 +00001754PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001755 PyObject *str, /* String */
1756 PyObject *substr, /* Substring to count */
1757 Py_ssize_t start, /* Start index */
1758 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001759 );
1760
Barry Warsaw51ac5802000-03-20 16:36:48 +00001761/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001762 and return the resulting Unicode object. */
1763
Mark Hammond91a681d2002-08-12 07:21:58 +00001764PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 PyObject *str, /* String */
1766 PyObject *substr, /* Substring to find */
1767 PyObject *replstr, /* Substring to replace */
1768 Py_ssize_t maxcount /* Max. number of replacements to apply;
1769 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001770 );
1771
1772/* Compare two strings and return -1, 0, 1 for less than, equal,
1773 greater than resp. */
1774
Mark Hammond91a681d2002-08-12 07:21:58 +00001775PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001776 PyObject *left, /* Left string */
1777 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001778 );
1779
/* Compare a string with an ASCII-encoded C string.
   NOTE(review): presumably returns -1, 0, 1 like PyUnicode_Compare --
   confirm against the implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001780PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1781 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001782 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001783 );
1784
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001785/* Rich compare two strings and return one of the following:
1786
1787 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001788 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001789 - Py_NotImplemented in case the type combination is unknown
1790
1791 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1792 case the conversion of the arguments to Unicode fails with a
1793 UnicodeDecodeError.
1794
1795 Possible values for op:
1796
1797 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1798
1799*/
1800
1801PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 PyObject *left, /* Left string */
1803 PyObject *right, /* Right string */
1804 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001805 );
1806
Thomas Wouters7e474022000-07-16 12:04:32 +00001807/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001808 the resulting Unicode string. */
1809
Mark Hammond91a681d2002-08-12 07:21:58 +00001810PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001811 PyObject *format, /* Format string */
1812 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001813 );
1814
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001815/* Checks whether element is contained in container and returns 1/0
1816 accordingly.
1817
1818 element has to coerce to a one-element Unicode string. -1 is
1819 returned in case of an error. */
1820
Mark Hammond91a681d2002-08-12 07:21:58 +00001821PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001822 PyObject *container, /* Container string */
1823 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001824 );
1825
Martin v. Löwis47383402007-08-15 07:32:56 +00001826/* Checks whether argument is a valid identifier. */
1827
1828PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1829
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001830#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001831/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001832PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001833 PyUnicodeObject *self,
1834 int striptype,
1835 PyObject *sepobj
1836 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001837#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001838
Eric Smith5807c412008-05-11 21:00:57 +00001839/* Using the current locale, insert the thousands grouping
1840 into the string pointed to by buffer. For the argument descriptions,
1841 see Objects/stringlib/localeutil.h */
1842
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001843#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001844PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1845 Py_ssize_t n_buffer,
1846 Py_UNICODE *digits,
1847 Py_ssize_t n_digits,
1848 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001849#endif
Eric Smith5807c412008-05-11 21:00:57 +00001850
Eric Smitha3b1ac82009-04-03 14:45:06 +00001851/* Using explicit passed-in values, insert the thousands grouping
1852 into the string pointed to by buffer. For the argument descriptions,
1853 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001854#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001856 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 int kind,
1858 void *buffer,
1859 Py_ssize_t n_buffer,
1860 void *digits,
1861 Py_ssize_t n_digits,
1862 Py_ssize_t min_width,
1863 const char *grouping,
1864 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001865#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001866/* === Characters Type APIs =============================================== */
1867
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001868/* Helper array used by Py_UNICODE_ISSPACE(). */
1869
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001870#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001871PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1872
Guido van Rossumd8225182000-03-10 22:33:05 +00001873/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001874 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001875
1876 These APIs are implemented in Objects/unicodectype.c.
1877
1878*/
1879
Mark Hammond91a681d2002-08-12 07:21:58 +00001880PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001881 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001882 );
1883
Mark Hammond91a681d2002-08-12 07:21:58 +00001884PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001885 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001886 );
1887
Mark Hammond91a681d2002-08-12 07:21:58 +00001888PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001889 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001890 );
1891
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001892PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001893 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001894 );
1895
1896PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001897 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001898 );
1899
/* ch is passed by value; a top-level const qualifier is not part of the
   function's type, so the prototype takes a plain Py_UCS4 like the other
   character-type declarations in this section. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001900PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001901 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001902 );
1903
/* ch is passed by value; a top-level const qualifier is not part of the
   function's type, so the prototype takes a plain Py_UCS4 like the other
   character-type declarations in this section. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001904PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001905 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001906 );
1907
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001908PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1909 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001910 );
1911
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001912PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1913 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001914 );
1915
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001916PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1917 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001918 );
1919
Mark Hammond91a681d2002-08-12 07:21:58 +00001920PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001921 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001922 );
1923
Mark Hammond91a681d2002-08-12 07:21:58 +00001924PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001925 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001926 );
1927
Mark Hammond91a681d2002-08-12 07:21:58 +00001928PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001929 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001930 );
1931
Mark Hammond91a681d2002-08-12 07:21:58 +00001932PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001933 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001934 );
1935
Mark Hammond91a681d2002-08-12 07:21:58 +00001936PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001937 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001938 );
1939
Mark Hammond91a681d2002-08-12 07:21:58 +00001940PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001941 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001942 );
1943
Georg Brandl559e5d72008-06-11 18:37:52 +00001944PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001945 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001946 );
1947
Mark Hammond91a681d2002-08-12 07:21:58 +00001948PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001949 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001950 );
1951
Victor Stinneref8d95c2010-08-16 22:03:11 +00001952PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1953 const Py_UNICODE *u
1954 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001955
1956PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001957 Py_UNICODE *s1,
1958 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001959
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001960PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1961 Py_UNICODE *s1, const Py_UNICODE *s2);
1962
Martin v. Löwis5b222132007-06-10 09:51:05 +00001963PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001964 Py_UNICODE *s1,
1965 const Py_UNICODE *s2,
1966 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001967
1968PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001969 const Py_UNICODE *s1,
1970 const Py_UNICODE *s2
1971 );
1972
1973PyAPI_FUNC(int) Py_UNICODE_strncmp(
1974 const Py_UNICODE *s1,
1975 const Py_UNICODE *s2,
1976 size_t n
1977 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001978
1979PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001980 const Py_UNICODE *s,
1981 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001982 );
1983
Victor Stinner331ea922010-08-10 16:37:20 +00001984PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001985 const Py_UNICODE *s,
1986 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001987 );
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989PyAPI_FUNC(size_t) Py_UCS4_strlen(
1990 const Py_UCS4 *u
1991 );
1992
1993PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1994 Py_UCS4 *s1,
1995 const Py_UCS4 *s2);
1996
1997PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1998 Py_UCS4 *s1, const Py_UCS4 *s2);
1999
2000PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
2001 Py_UCS4 *s1,
2002 const Py_UCS4 *s2,
2003 size_t n);
2004
2005PyAPI_FUNC(int) Py_UCS4_strcmp(
2006 const Py_UCS4 *s1,
2007 const Py_UCS4 *s2
2008 );
2009
2010PyAPI_FUNC(int) Py_UCS4_strncmp(
2011 const Py_UCS4 *s1,
2012 const Py_UCS4 *s2,
2013 size_t n
2014 );
2015
2016PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
2017 const Py_UCS4 *s,
2018 Py_UCS4 c
2019 );
2020
2021PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
2022 const Py_UCS4 *s,
2023 Py_UCS4 c
2024 );
2025
Victor Stinner71133ff2010-09-01 23:43:53 +00002026/* Create a copy of a unicode string ending with a nul character. Return NULL
2027 and raise a MemoryError exception on memory allocation failure, otherwise
2028 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2029
Victor Stinner46408602010-09-03 16:18:00 +00002030PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002031 PyObject *unicode
2032 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002033#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002034
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002035#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
/* Debug-only declaration: visible only when Py_DEBUG is defined and
   Py_LIMITED_API is not.
   NOTE(review): presumably validates the internal invariants of a unicode
   object, with check_content enabling a deeper check of the character
   data -- confirm against the definition. */
2036/* FIXME: use PyObject* type for op */
2037PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2038 void *op,
2039 int check_content);
2040#endif
2041
Guido van Rossumd8225182000-03-10 22:33:05 +00002042#ifdef __cplusplus
2043}
2044#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002045#endif /* !Py_UNICODEOBJECT_H */