blob: dc3bad7751bf8c0d9a9dc6ed2fed5644410575b4 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
/* SIZEOF_WCHAR_T has to be defined before this point; the code-unit size
   below is taken directly from it. */
#ifndef SIZEOF_WCHAR_T
#error Must define SIZEOF_WCHAR_T
#endif

/* Size in bytes of one Py_UNICODE code unit. */
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T

/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
   for UTF-16) */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */

#ifndef Py_LIMITED_API
/* PY_UNICODE_TYPE names the underlying C type; Py_UNICODE is the typedef
   built on that same type. Neither is visible under Py_LIMITED_API. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations.

   Py_UCS4 is picked from the configured SIZEOF_INT / SIZEOF_LONG values:
   unsigned int when int has at least 4 bytes, else unsigned long when long
   has at least 4 bytes; otherwise compilation stops with an error. */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#else
#error "Could not find a proper typedef for Py_UCS4"
#endif

typedef unsigned short Py_UCS2;
typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Nonzero if ch is whitespace. Values below 128 are answered from the
   _Py_ascii_whitespace look-up table; all other values are passed to
   _PyUnicode_IsWhitespace(). Note that ch is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
/* Character classification: each macro forwards ch to the matching
   _PyUnicode_Is*() helper. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversion: forwarded to the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable classification. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value extraction: forwarded to the _PyUnicode_To*() helpers. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric. The tests are
   short-circuited in that order, so ch may be evaluated up to four times;
   do not pass an expression with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE code units from source to target using
   Py_MEMCPY; the byte count is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Each argument is evaluated exactly once: the length is captured in a
   local before the loop so that an argument with side effects (or an
   expensive expression) is not re-evaluated on every iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* macros to work with surrogates */
/* The argument is parenthesized inside each range test so that passing an
   expression built from lower-precedence operators (e.g. `x & 0xFFFF`)
   is compared as a whole. ch is evaluated up to two times. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of `high` become bits 10-19, the low 10 bits of `low`
   become bits 0-9, and 0x10000 is added to the result. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The comparison is done on the wchar_t (wstr) representation: first and
   last code unit are compared before falling back to memcmp().  wstr is
   reached through a PyASCIIObject cast and the length through
   PyUnicode_WSTR_LENGTH(), because wstr and wstr_length are not direct
   members of PyUnicodeObject.  No type checks are performed and wstr is
   not created on demand.
   NOTE(review): callers presumably must make sure both objects already
   have a non-NULL wstr (e.g. via PyUnicode_AS_UNICODE) -- confirm.
   Arguments are evaluated more than once. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200209 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
220 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
241 * kind = PyUnicode_WCHAR_KIND
242 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200243 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * ready = 0
245 * wstr is not NULL
246 * data.any is NULL
247 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200248 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200249 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200250
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200251 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200252
253 * structure = PyUnicodeObject structure
254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255 PyUnicode_4BYTE_KIND
256 * compact = 0
257 * ready = 1
258 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200259 * utf8 is shared and utf8_length = length with data.any if ascii = 1
260 * utf8_length = 0 if utf8 is NULL
261 * wstr is shared and wstr_length = length with data.any
262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
263 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
264 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200265
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200266 Compact strings use only one memory block (structure + characters),
267 whereas legacy strings use one block for the structure and one block
268 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200269
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200270 Legacy strings are created by PyUnicode_FromUnicode() and
271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272 when PyUnicode_READY() is called.
273
274 See also _PyUnicode_CheckConsistency().
275 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000276 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200277 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000278 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 struct {
280 /*
281 SSTATE_NOT_INTERNED (0)
282 SSTATE_INTERNED_MORTAL (1)
283 SSTATE_INTERNED_IMMORTAL (2)
284
285 If interned != SSTATE_NOT_INTERNED, the two references from the
286 dictionary to this object are *not* counted in ob_refcnt.
287 */
288 unsigned int interned:2;
289 /* Character size:
290
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200291 - PyUnicode_WCHAR_KIND (0):
292
293 * character type = wchar_t (16 or 32 bits, depending on the
294 platform)
295
296 - PyUnicode_1BYTE_KIND (1):
297
298 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200299 * if ascii is set, all characters must be in range
300 U+0000-U+007F, otherwise at least one character must be in range
301 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302
303 - PyUnicode_2BYTE_KIND (2):
304
305 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200306 * at least one character must be in range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200307
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200308 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200309
310 * character type = Py_UCS4 (32 bits, unsigned)
311 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200313 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 /* Compact is with respect to the allocation scheme. Compact unicode
315 objects only require one memory block while non-compact objects use
316 one block for the PyUnicodeObject struct and another for its data
317 buffer. */
318 unsigned int compact:1;
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200319 /* The string only contains characters in range U+0000-U+007F (ASCII)
320 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
321 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 unsigned int ascii:1;
323 /* The ready flag indicates whether the object layout is initialized
324 completely. This means that this is either a compact object, or
325 the data pointer is filled out. The bit is redundant, and helps
326 to minimize the test in PyUnicode_IS_READY(). */
327 unsigned int ready:1;
328 } state;
329 wchar_t *wstr; /* wchar_t representation (null-terminated) */
330} PyASCIIObject;
331
332/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200333 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 immediately follow the structure. */
335typedef struct {
336 PyASCIIObject _base;
337 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
338 * terminating \0. */
339 char *utf8; /* UTF-8 representation (null-terminated) */
340 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
341 * surrogates count as two code points. */
342} PyCompactUnicodeObject;
343
344/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
345 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200346 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200347typedef struct {
348 PyCompactUnicodeObject _base;
349 union {
350 void *any;
351 Py_UCS1 *latin1;
352 Py_UCS2 *ucs2;
353 Py_UCS4 *ucs4;
354 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000355} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000356#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000357
Mark Hammond91a681d2002-08-12 07:21:58 +0000358PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000359PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000360
Thomas Wouters27d517b2007-02-25 20:39:11 +0000361#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000362 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
363#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000364
365/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000366#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200367
/* Length of the wstr representation.  Compact ASCII objects use the
   PyASCIIObject structure, which has no wstr_length member, so the value is
   read from `length`; every other form reads wstr_length from
   PyCompactUnicodeObject.  op is parenthesized before casting so that an
   expression argument is cast as a whole.  op is evaluated more than once;
   no type checks are performed. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII((op)) ?             \
     ((PyASCIIObject*)(op))->length :               \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
372
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points.

   When wstr is NULL, PyUnicode_AsUnicode() is called first and its return
   value is discarded; the length is then read through
   PyUnicode_WSTR_LENGTH().  op is evaluated more than once. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation in bytes:
   PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
387
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   If wstr is already set it is returned directly; otherwise the result of
   PyUnicode_AsUnicode() is returned.  op is evaluated more than once. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* Same pointer as PyUnicode_AS_UNICODE(op), cast to const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
400
401
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200402/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200403
/* Values for PyUnicodeObject.state: */

/* Interning state (values of the 2-bit state.interned field). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
410
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  op is parenthesized before the cast so
   that an expression argument is cast as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op may be evaluated twice. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200426
/* Values stored in the state.kind bit field.  The three *BYTE_KIND values
   are equal to the width in bytes of one character of that kind. */
enum PyUnicode_Kind {
/* The string only has its wstr (wchar_t) representation.  This is only
   possible when the string was created with a legacy API and
   _PyUnicode_Ready() has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200437
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
446
/* Return one of the PyUnicode_*_KIND values defined above.
   Asserts that op is a unicode object and that it is ready, then reads
   state.kind.  op is evaluated more than once when assertions are
   enabled. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
452
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the characters start directly after the object
   structure -- after PyASCIIObject when state.ascii is set, otherwise after
   PyCompactUnicodeObject. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the characters live in the separate data.any
   buffer, which is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on state.compact to one of the two helpers above.
   op is evaluated more than once. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
467
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   The value is cast to the element type selected by kind (Py_UCS1, Py_UCS2
   or Py_UCS4) before being stored.  Any kind other than 1BYTE/2BYTE takes
   the default branch, which asserts that it is PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
494
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.

   data is indexed as Py_UCS1, Py_UCS2 or Py_UCS4 depending on kind (any
   kind other than 1BYTE/2BYTE is read as Py_UCS4) and the result is
   converted to Py_UCS4.  kind may be evaluated twice. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
506
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.

   Asserts that `unicode` is a ready unicode object.  `unicode` is evaluated
   several times, so it must not have side effects; `index` is used in
   exactly one of the three branches. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522
/* Returns the length of the unicode string (the `length` field, i.e. the
   number of code points).  The caller has to make sure that the string has
   its canonical representation set before calling this macro; call
   PyUnicode_READY() to ensure that.  Both conditions are only checked
   with assert(). */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
530
531
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Reads the state.ready bit; op is parenthesized before the cast so that
   an expression argument is cast as a whole.  No type check is done. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
536
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors.

   An object whose ready bit is already set yields 0 without any function
   call.  op is evaluated more than once. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200545
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   The result is the upper bound of the storage class, not the largest
   character actually present: 0x7f for ASCII strings, 0xff for other
   1-byte strings, 0xffff for 2-byte strings and 0x10ffff otherwise.
   op must be ready (asserted) and is evaluated more than once. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7f) :                                                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000559#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000560
561/* --- Constants ---------------------------------------------------------- */
562
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0.  The constant has type Py_UCS4. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000569
570/* === Public API ========================================================= */
571
572/* --- Plain Py_UNICODE --------------------------------------------------- */
573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200574/* With PEP 393, this is the recommended way to allocate a new unicode object.
575 This function will allocate the object and its buffer in a single memory
576 block. Objects created using this function are not resizable. */
577#ifndef Py_LIMITED_API
578PyAPI_FUNC(PyObject*) PyUnicode_New(
579 Py_ssize_t size, /* Number of code points in the new string */
580 Py_UCS4 maxchar /* maximum code point value in the string */
581 );
582#endif
583
Victor Stinnerd8f65102011-09-29 19:43:17 +0200584/* Initializes the canonical string representation from a the deprecated
585 wstr/Py_UNICODE representation. This function is used to convert Unicode
586 objects which were created using the old API to the new flexible format
587 introduced with PEP 393.
588
589 Don't call this function directly, use the public PyUnicode_READY() macro
590 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591#ifndef Py_LIMITED_API
592PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200593 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594 );
595#endif
596
Victor Stinner034f6cf2011-09-30 02:26:44 +0200597/* Get a copy of a Unicode string. */
598PyAPI_FUNC(PyObject*) PyUnicode_Copy(
599 PyObject *unicode
600 );
601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200603 character conversion when necessary and falls back to memcpy if possible.
604
Victor Stinnera0702ab2011-09-29 14:14:38 +0200605 Fail if to is too small (smaller than how_many or smaller than
606 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
607 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200608
609 Return the number of characters written, or return -1 and raise an exception
610 on error.
611
612 Pseudo-code:
613
614 how_many = min(how_many, len(from) - from_start)
615 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
616 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200617
618 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200619 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622 PyObject *to,
623 Py_ssize_t to_start,
624 PyObject *from,
625 Py_ssize_t from_start,
626 Py_ssize_t how_many
627 );
628#endif
629
Guido van Rossumd8225182000-03-10 22:33:05 +0000630/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000632
633 u may be NULL which causes the contents to be undefined. It is the
634 user's responsibility to fill in the needed data afterwards. Note
635 that modifying the Unicode object contents after construction is
636 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000637
638 The buffer is copied into the new object. */
639
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000640#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000641PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000642 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000643 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000644 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000645#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000646
Georg Brandl952867a2010-06-27 10:17:12 +0000647/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000648PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000649 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000650 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000651 );
652
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000653/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000655PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000656 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000657 );
658
Victor Stinnerb9275c12011-10-05 14:01:42 +0200659/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
660 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661#ifndef Py_LIMITED_API
662PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
663 int kind,
664 const void *buffer,
665 Py_ssize_t size);
666#endif
667
668PyAPI_FUNC(PyObject*) PyUnicode_Substring(
669 PyObject *str,
670 Py_ssize_t start,
671 Py_ssize_t end);
672
Georg Brandldb6c7f52011-10-07 11:19:11 +0200673/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200674 is set. Return NULL and raise an exception on error. Raise a ValueError if
675 the buffer is smaller than the string. Return buffer on success.
676
677 buflen is the length of the buffer in (Py_UCS4) characters. */
678PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
679 PyObject *unicode,
680 Py_UCS4* buffer,
681 Py_ssize_t buflen,
682 int copy_null);
683
684/* Copy the string into a UCS4 buffer. A new buffer is allocated using
685 PyMem_Malloc; if this fails, NULL is returned with a memory error
686 exception set. */
687PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
688
Guido van Rossumd8225182000-03-10 22:33:05 +0000689/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690 Py_UNICODE buffer.
691 If the wchar_t/Py_UNICODE representation is not yet available, this
692 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000693
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000694#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000695PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000697 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000698#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700/* Return a read-only pointer to the Unicode object's internal
701 Py_UNICODE buffer and save the length at size.
702 If the wchar_t/Py_UNICODE representation is not yet available, this
703 function will calculate it. */
704
705#ifndef Py_LIMITED_API
706PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
707 PyObject *unicode, /* Unicode object */
708 Py_ssize_t *size /* location where to save the length */
709 );
710#endif
711
Guido van Rossumd8225182000-03-10 22:33:05 +0000712/* Get the length of the Unicode object. */
713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
715 PyObject *unicode
716);
717
Victor Stinner157f83f2011-09-28 21:41:31 +0200718/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719 string representation. */
720
Martin v. Löwis18e16552006-02-15 17:27:45 +0000721PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000722 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000723 );
724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200725/* Read a character from the string. */
726
727PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
728 PyObject *unicode,
729 Py_ssize_t index
730 );
731
732/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200733 PyUnicode_New, must not be shared, and must not have been hashed yet.
734
735 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736
737PyAPI_FUNC(int) PyUnicode_WriteChar(
738 PyObject *unicode,
739 Py_ssize_t index,
740 Py_UCS4 character
741 );
742
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000743#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000744/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000745PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000746#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000747
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200748/* Resize a Unicode object allocated by the legacy API (e.g.
749 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
750 PyUnicode_New) cannot be resized by this function.
751
752 The length is a number of Py_UNICODE characters (and not the number of code
753 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000754
755 *unicode is modified to point to the new (resized) object and 0
756 returned on success.
757
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200758 If the refcount on the object is 1, the function resizes the string in
759 place, which is usually faster than allocating a new string (and copying
760 the characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000761
762 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200763 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000764
Mark Hammond91a681d2002-08-12 07:21:58 +0000765PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 PyObject **unicode, /* Pointer to the Unicode object */
767 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000768 );
769
Guido van Rossumd8225182000-03-10 22:33:05 +0000770/* Coerce obj to a Unicode object and return a reference with
771 *incremented* refcount.
772
773 Coercion is done in the following way:
774
Georg Brandl952867a2010-06-27 10:17:12 +0000775 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000776 under the assumption that they contain data using the UTF-8
777 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000778
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000779 2. All other objects (including Unicode objects) raise an
780 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000781
782 The API returns NULL in case of an error. The caller is responsible
783 for decref'ing the returned objects.
784
785*/
786
Mark Hammond91a681d2002-08-12 07:21:58 +0000787PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000789 const char *encoding, /* encoding */
790 const char *errors /* error handling */
791 );
792
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000793/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000794 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000796 Unicode objects are passed back as-is (subclasses are converted to
797 true Unicode objects), all other objects are delegated to
798 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000799 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000800
801 The API returns NULL in case of an error. The caller is responsible
802 for decref'ing the returned objects.
803
804*/
805
Mark Hammond91a681d2002-08-12 07:21:58 +0000806PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000808 );
809
Victor Stinner1205f272010-09-11 00:54:47 +0000810PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
811 const char *format, /* ASCII-encoded string */
812 va_list vargs
813 );
814PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
815 const char *format, /* ASCII-encoded string */
816 ...
817 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000819#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000820/* Format the object based on the format_spec, as defined in PEP 3101
821 (Advanced String Formatting). */
822PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200823 PyObject *format_spec,
824 Py_ssize_t start,
825 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000826#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000827
Walter Dörwald16807132007-05-25 13:52:07 +0000828PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
829PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000830PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
831 const char *u /* UTF-8 encoded string */
832 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000833#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000834PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000835#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000836
837/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838#define PyUnicode_CHECK_INTERNED(op) \
839 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000840
Guido van Rossumd8225182000-03-10 22:33:05 +0000841/* --- wchar_t support for platforms which support it --------------------- */
842
843#ifdef HAVE_WCHAR_H
844
Georg Brandl952867a2010-06-27 10:17:12 +0000845/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000846 size.
847
848 The buffer is copied into the new object. */
849
Mark Hammond91a681d2002-08-12 07:21:58 +0000850PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000851 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000852 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000853 );
854
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000855/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000856 most size wchar_t characters are copied.
857
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000858 Note that the resulting wchar_t string may or may not be
859 0-terminated. It is the responsibility of the caller to make sure
860 that the wchar_t string is 0-terminated in case this is required by
861 the application.
862
863 Returns the number of wchar_t characters copied (excluding a
864 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000865 error. */
866
Martin v. Löwis18e16552006-02-15 17:27:45 +0000867PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000868 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000869 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000871 );
872
Victor Stinner137c34c2010-09-29 10:25:54 +0000873/* Convert the Unicode object to a wide character string. The output string
874 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200875 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000876
877 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
878 on success. On error, returns NULL and raises a MemoryError; *size is
879 undefined in that case. */
880
881PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000882 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000883 Py_ssize_t *size /* number of characters of the result */
884 );
885
Victor Stinner9f789e72011-10-01 03:57:28 +0200886#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200888#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889
Guido van Rossumd8225182000-03-10 22:33:05 +0000890#endif
891
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000892/* --- Unicode ordinals --------------------------------------------------- */
893
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000894/* Create a Unicode Object from the given Unicode code point ordinal.
895
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000896 The ordinal must be in range(0x110000). A ValueError is raised in
897 case it is not. With the PEP 393 string representation this limit no
898 longer depends on a narrow (UCS2) or wide (UCS4) build.
899
900*/
901
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000902PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000903
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000904/* --- Free-list management ----------------------------------------------- */
905
906/* Clear the free list used by the Unicode implementation.
907
908 This can be used to release memory used for objects on the free
909 list back to the Python memory allocator.
910
911*/
912
913PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
914
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000916
917 Many of these APIs take two arguments encoding and errors. These
918 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000919 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000920
Georg Brandl952867a2010-06-27 10:17:12 +0000921 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000922
923 Error handling is set by errors which may also be set to NULL
924 meaning to use the default handling defined for the codec. Default
925 error handling for all builtin codecs is "strict" (ValueErrors are
926 raised).
927
928 The codecs all use a similar interface. Only deviations from the
929 generic ones are documented.
930
931*/
932
Fred Drakecb093fe2000-05-09 19:51:53 +0000933/* --- Manage the default encoding ---------------------------------------- */
934
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000935/* Returns a pointer to the UTF-8 (default encoding) representation of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000936 Unicode object unicode and stores the size of the encoded representation,
937 in bytes, in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000938
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000939 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000940
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200941 This function caches the UTF-8 encoded string in the unicodeobject
942 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200943 when the unicodeobject is deallocated.
944
945 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
946 support the previous internal function with the same behaviour.
947
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000948 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000949 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000950
951 *** If you need to access the Unicode object as UTF-8 bytes string,
952 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000953*/
954
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000955#ifndef Py_LIMITED_API
/* Only declared when Py_LIMITED_API is not defined. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000957 PyObject *unicode, /* Unicode object */
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000958 Py_ssize_t *size); /* receives the size of the encoded representation in bytes */
/* Alias for the previous internal function name with the same behaviour. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000960#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000961
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000962/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000963 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
966 in the unicodeobject.
967
968 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
969 support the previous internal function with the same behaviour.
970
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000971 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000972 extracted from the returned data.
973
974 *** This API is for interpreter INTERNAL USE ONLY and will likely
975 *** be removed or changed in the future.
976
977 *** If you need to access the Unicode object as UTF-8 bytes string,
978 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000979
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000980*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000981
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000982#ifndef Py_LIMITED_API
/* Only declared when Py_LIMITED_API is not defined. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
/* Alias for the previous internal function name with the same behaviour. */
984#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000985#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000986
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000987/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000988
Mark Hammond91a681d2002-08-12 07:21:58 +0000989PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000990
Guido van Rossumd8225182000-03-10 22:33:05 +0000991/* --- Generic Codecs ----------------------------------------------------- */
992
993/* Create a Unicode object by decoding the encoded string s of the
994 given size. */
995
Mark Hammond91a681d2002-08-12 07:21:58 +0000996PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000997 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000998 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 const char *encoding, /* encoding */
1000 const char *errors /* error handling */
1001 );
1002
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001003/* Decode a Unicode object unicode and return the result as Python
1004 object. */
1005
1006PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001007 PyObject *unicode, /* Unicode object */
1008 const char *encoding, /* encoding */
1009 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001010 );
1011
1012/* Decode a Unicode object unicode and return the result as Unicode
1013 object. */
1014
1015PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 PyObject *unicode, /* Unicode object */
1017 const char *encoding, /* encoding */
1018 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001019 );
1020
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001022 Python string object. */
1023
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001024#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001025PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001026 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001028 const char *encoding, /* encoding */
1029 const char *errors /* error handling */
1030 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001031#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001032
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001033/* Encodes a Unicode object and returns the result as Python
1034 object. */
1035
1036PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037 PyObject *unicode, /* Unicode object */
1038 const char *encoding, /* encoding */
1039 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001040 );
1041
Guido van Rossumd8225182000-03-10 22:33:05 +00001042/* Encodes a Unicode object and returns the result as Python string
1043 object. */
1044
Mark Hammond91a681d2002-08-12 07:21:58 +00001045PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 PyObject *unicode, /* Unicode object */
1047 const char *encoding, /* encoding */
1048 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001049 );
1050
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001051/* Encodes a Unicode object and returns the result as Unicode
1052 object. */
1053
1054PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 PyObject *unicode, /* Unicode object */
1056 const char *encoding, /* encoding */
1057 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001058 );
1059
1060/* Build an encoding map. */
1061
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001062PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1063 PyObject* string /* 256 character map */
1064 );
1065
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066/* --- UTF-7 Codecs ------------------------------------------------------- */
1067
Mark Hammond91a681d2002-08-12 07:21:58 +00001068PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 const char *string, /* UTF-7 encoded string */
1070 Py_ssize_t length, /* size of string */
1071 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001072 );
1073
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001074PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001075 const char *string, /* UTF-7 encoded string */
1076 Py_ssize_t length, /* size of string */
1077 const char *errors, /* error handling */
1078 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001079 );
1080
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001081#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001082PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001083 const Py_UNICODE *data, /* Unicode char buffer */
1084 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1085 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1086 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1087 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001088 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001089#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090
Guido van Rossumd8225182000-03-10 22:33:05 +00001091/* --- UTF-8 Codecs ------------------------------------------------------- */
1092
Mark Hammond91a681d2002-08-12 07:21:58 +00001093PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 const char *string, /* UTF-8 encoded string */
1095 Py_ssize_t length, /* size of string */
1096 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001097 );
1098
Walter Dörwald69652032004-09-07 20:24:22 +00001099PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 const char *string, /* UTF-8 encoded string */
1101 Py_ssize_t length, /* size of string */
1102 const char *errors, /* error handling */
1103 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001104 );
1105
Mark Hammond91a681d2002-08-12 07:21:58 +00001106PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001108 );
1109
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001110#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1112 PyObject *unicode,
1113 const char *errors);
1114
Mark Hammond91a681d2002-08-12 07:21:58 +00001115PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116 const Py_UNICODE *data, /* Unicode char buffer */
1117 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1118 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001119 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001120#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001121
Walter Dörwald41980ca2007-08-16 21:55:45 +00001122/* --- UTF-32 Codecs ------------------------------------------------------ */
1123
1124/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1125 the corresponding Unicode object.
1126
1127 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001129
1130 If byteorder is non-NULL, the decoder starts decoding using the
1131 given byte order:
1132
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001133 *byteorder == -1: little endian
1134 *byteorder == 0: native order
1135 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001136
1137 In native mode, the first four bytes of the stream are checked for a
1138 BOM mark. If found, the BOM mark is analysed, the byte order
1139 adjusted and the BOM skipped. In the other modes, no BOM mark
1140 interpretation is done. After completion, *byteorder is set to the
1141 current byte order at the end of input data.
1142
1143 If byteorder is NULL, the codec starts in native order mode.
1144
1145*/
1146
1147PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 const char *string, /* UTF-32 encoded string */
1149 Py_ssize_t length, /* size of string */
1150 const char *errors, /* error handling */
1151 int *byteorder /* pointer to byteorder to use
1152 0=native;-1=LE,1=BE; updated on
1153 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001154 );
1155
1156PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001157 const char *string, /* UTF-32 encoded string */
1158 Py_ssize_t length, /* size of string */
1159 const char *errors, /* error handling */
1160 int *byteorder, /* pointer to byteorder to use
1161 0=native;-1=LE,1=BE; updated on
1162 exit */
1163 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001164 );
1165
1166/* Returns a Python string using the UTF-32 encoding in native byte
1167 order. The string always starts with a BOM mark. */
1168
1169PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001171 );
1172
1173/* Returns a Python string object holding the UTF-32 encoded value of
1174 the Unicode data.
1175
1176 If byteorder is not 0, output is written according to the following
1177 byte order:
1178
1179 byteorder == -1: little endian
1180 byteorder == 0: native byte order (writes a BOM mark)
1181 byteorder == 1: big endian
1182
1183 If byteorder is 0, the output string will always start with the
1184 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1185 prepended.
1186
1187*/
1188
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001189#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001190PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001191 const Py_UNICODE *data, /* Unicode char buffer */
1192 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1193 const char *errors, /* error handling */
1194 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001195 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001196#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001197
Guido van Rossumd8225182000-03-10 22:33:05 +00001198/* --- UTF-16 Codecs ------------------------------------------------------ */
1199
Guido van Rossum9e896b32000-04-05 20:11:21 +00001200/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001201 the corresponding Unicode object.
1202
1203 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001205
1206 If byteorder is non-NULL, the decoder starts decoding using the
1207 given byte order:
1208
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001209 *byteorder == -1: little endian
1210 *byteorder == 0: native order
1211 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001212
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001213 In native mode, the first two bytes of the stream are checked for a
1214 BOM mark. If found, the BOM mark is analysed, the byte order
1215 adjusted and the BOM skipped. In the other modes, no BOM mark
1216 interpretation is done. After completion, *byteorder is set to the
1217 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001218
1219 If byteorder is NULL, the codec starts in native order mode.
1220
1221*/
1222
Mark Hammond91a681d2002-08-12 07:21:58 +00001223PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 const char *string, /* UTF-16 encoded string */
1225 Py_ssize_t length, /* size of string */
1226 const char *errors, /* error handling */
1227 int *byteorder /* pointer to byteorder to use
1228 0=native;-1=LE,1=BE; updated on
1229 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001230 );
1231
Walter Dörwald69652032004-09-07 20:24:22 +00001232PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001233 const char *string, /* UTF-16 encoded string */
1234 Py_ssize_t length, /* size of string */
1235 const char *errors, /* error handling */
1236 int *byteorder, /* pointer to byteorder to use
1237 0=native;-1=LE,1=BE; updated on
1238 exit */
1239 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001240 );
1241
Guido van Rossumd8225182000-03-10 22:33:05 +00001242/* Returns a Python string using the UTF-16 encoding in native byte
1243 order. The string always starts with a BOM mark. */
1244
Mark Hammond91a681d2002-08-12 07:21:58 +00001245PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001246 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001247 );
1248
1249/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001250 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001251
1252 Output is written according to the following byte order, selected
1253 by byteorder:
1254
1255 byteorder == -1: little endian
1256 byteorder == 0: native byte order (writes a BOM mark)
1257 byteorder == 1: big endian
1258
1259 If byteorder is 0, the output string will always start with the
1260 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1261 prepended.
1262
1263 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1264 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001265 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001266
1267*/
1268
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001269#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001270PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 const Py_UNICODE *data, /* Unicode char buffer */
1272 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1273 const char *errors, /* error handling */
1274 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001275 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001276#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001277
1278/* --- Unicode-Escape Codecs ---------------------------------------------- */
1279
Mark Hammond91a681d2002-08-12 07:21:58 +00001280PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001281 const char *string, /* Unicode-Escape encoded string */
1282 Py_ssize_t length, /* size of string */
1283 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001284 );
1285
Mark Hammond91a681d2002-08-12 07:21:58 +00001286PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001288 );
1289
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001290#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001291PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 const Py_UNICODE *data, /* Unicode char buffer */
1293 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001294 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001295#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001296
1297/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1298
Mark Hammond91a681d2002-08-12 07:21:58 +00001299PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 const char *string, /* Raw-Unicode-Escape encoded string */
1301 Py_ssize_t length, /* size of string */
1302 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001303 );
1304
Mark Hammond91a681d2002-08-12 07:21:58 +00001305PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001307 );
1308
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001309#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001310PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 const Py_UNICODE *data, /* Unicode char buffer */
1312 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001313 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001315
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001316/* --- Unicode Internal Codec ---------------------------------------------
1317
1318 Only for internal use in _codecsmodule.c */
1319
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001320#ifndef Py_LIMITED_API
/* Decoder for the "unicode internal" codec; excluded from the limited
   API by the surrounding #ifndef.

   NOTE(review): unlike the other codec prototypes in this header, this
   one is declared without PyAPI_FUNC() -- presumably intentional since
   it is for internal use only; confirm. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001321PyObject *_PyUnicode_DecodeUnicodeInternal(
1322 const char *string, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001323 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001324 const char *errors /* error handling */
1325 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001326#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001327
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001329
1330 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1331
1332*/
1333
Mark Hammond91a681d2002-08-12 07:21:58 +00001334PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001335 const char *string, /* Latin-1 encoded string */
1336 Py_ssize_t length, /* size of string */
1337 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001338 );
1339
Mark Hammond91a681d2002-08-12 07:21:58 +00001340PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001342 );
1343
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001344#ifndef Py_LIMITED_API
/* Underscore-prefixed variant of PyUnicode_AsLatin1String() that takes
   an explicit errors argument. Like the raw-buffer encoder below, it
   is excluded from the limited API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1346 PyObject* unicode, /* Unicode object */
1347 const char* errors); /* error handling */
1348
Mark Hammond91a681d2002-08-12 07:21:58 +00001349PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 const Py_UNICODE *data, /* Unicode char buffer */
1351 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1352 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001353 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001354#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001357
1358 Only 7-bit ASCII data is accepted. All other codes generate errors.
1359
1360*/
1361
Mark Hammond91a681d2002-08-12 07:21:58 +00001362PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001363 const char *string, /* ASCII encoded string */
1364 Py_ssize_t length, /* size of string */
1365 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001366 );
1367
Mark Hammond91a681d2002-08-12 07:21:58 +00001368PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001369 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001370 );
1371
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001372#ifndef Py_LIMITED_API
/* Underscore-prefixed variant of PyUnicode_AsASCIIString() that takes
   an explicit errors argument. Like the raw-buffer encoder below, it
   is excluded from the limited API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1374 PyObject* unicode, /* Unicode object */
1375 const char* errors); /* error handling */
1376
Mark Hammond91a681d2002-08-12 07:21:58 +00001377PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378 const Py_UNICODE *data, /* Unicode char buffer */
1379 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1380 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001381 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001382#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001383
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001385
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001387
1388 Decoding mappings must map single string characters to single
1389 Unicode characters, integers (which are then interpreted as Unicode
1390 ordinals) or None (meaning "undefined mapping" and causing an
1391 error).
1392
1393 Encoding mappings must map single Unicode characters to single
1394 string characters, integers (which are then interpreted as Latin-1
1395 ordinals) or None (meaning "undefined mapping" and causing an
1396 error).
1397
1398 If a character lookup fails with a LookupError, the character is
1399 copied as-is meaning that its ordinal value will be interpreted as
1400 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1401 to contain those mappings which map characters to different code
1402 points.
1403
1404*/
1405
Mark Hammond91a681d2002-08-12 07:21:58 +00001406PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 const char *string, /* Encoded string */
1408 Py_ssize_t length, /* size of string */
1409 PyObject *mapping, /* character mapping
1410 (char ordinal -> unicode ordinal) */
1411 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001412 );
1413
Mark Hammond91a681d2002-08-12 07:21:58 +00001414PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001415 PyObject *unicode, /* Unicode object */
1416 PyObject *mapping /* character mapping
1417 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001418 );
1419
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001420#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001421PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 const Py_UNICODE *data, /* Unicode char buffer */
1423 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1424 PyObject *mapping, /* character mapping
1425 (unicode ordinal -> char ordinal) */
1426 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001427 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001428PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1429 PyObject *unicode, /* Unicode object */
1430 PyObject *mapping, /* character mapping
1431 (unicode ordinal -> char ordinal) */
1432 const char *errors /* error handling */
1433 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001434#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001435
1436/* Translate a Py_UNICODE buffer of the given length by applying a
1437 character mapping table to it and return the resulting Unicode
1438 object.
1439
1440 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001442
1443 Mapping tables may be dictionaries or sequences. Unmapped character
1444 ordinals (ones which cause a LookupError) are left untouched and
1445 are copied as-is.
1446
1447*/
1448
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001449#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001450PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 const Py_UNICODE *data, /* Unicode char buffer */
1452 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1453 PyObject *table, /* Translate table (dictionary or sequence) */
1454 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001455 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001456#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001457
Victor Stinner99b95382011-07-04 14:23:54 +02001458#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001459
Guido van Rossumefec1152000-03-28 02:01:15 +00001460/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001461
Mark Hammond91a681d2002-08-12 07:21:58 +00001462PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001463 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001464 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001465 const char *errors /* error handling */
1466 );
1467
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001468PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1469 const char *string, /* MBCS encoded string */
1470 Py_ssize_t length, /* size of string */
1471 const char *errors, /* error handling */
1472 Py_ssize_t *consumed /* bytes consumed */
1473 );
1474
Victor Stinner3a50e702011-10-18 21:21:00 +02001475PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1476 int code_page, /* code page number */
1477 const char *string, /* encoded string */
1478 Py_ssize_t length, /* size of string */
1479 const char *errors, /* error handling */
1480 Py_ssize_t *consumed /* bytes consumed */
1481 );
1482
Mark Hammond91a681d2002-08-12 07:21:58 +00001483PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001484 PyObject *unicode /* Unicode object */
1485 );
1486
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001487#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001488PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001489 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001490 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001491 const char *errors /* error handling */
1492 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001493#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001494
Victor Stinner3a50e702011-10-18 21:21:00 +02001495PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1496 int code_page, /* code page number */
1497 PyObject *unicode, /* Unicode object */
1498 const char *errors /* error handling */
1499 );
1500
Victor Stinner99b95382011-07-04 14:23:54 +02001501#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001502
Guido van Rossum9e896b32000-04-05 20:11:21 +00001503/* --- Decimal Encoder ---------------------------------------------------- */
1504
1505/* Takes a Unicode string holding a decimal value and writes it into
1506 an output buffer using standard ASCII digit codes.
1507
1508 The output buffer has to provide at least length+1 bytes of storage
1509 area. The output string is 0-terminated.
1510
1511 The encoder converts whitespace to ' ', decimal characters to their
1512 corresponding ASCII digit and all other Latin-1 characters except
1513 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1514 are treated as errors. This includes embedded NUL characters.
1515
1516 Error handling is defined by the errors argument:
1517
1518 NULL or "strict": raise a ValueError
1519 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001521 "replace": replaces illegal characters with '?'
1522
1523 Returns 0 on success, -1 on failure.
1524
1525*/
1526
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001527#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001528PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001529 Py_UNICODE *s, /* Unicode buffer */
1530 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1531 char *output, /* Output buffer; must have size >= length+1 (incl. terminating 0) */
1532 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001533 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001534#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001535
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001536/* Transforms code points that have decimal digit property to the
1537 corresponding ASCII digit code points.
1538
1539 Returns a new Unicode string on success, NULL on failure.
1540*/
1541
Georg Brandlb5503082010-12-05 11:40:48 +00001542#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001543PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1544 Py_UNICODE *s, /* Unicode buffer */
1545 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1546 );
Georg Brandlb5503082010-12-05 11:40:48 +00001547#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1550 as argument instead of a raw buffer and length. This function additionally
1551 transforms spaces to ASCII because this is what the callers in longobject,
1552 floatobject, and complexobject did anyways. */
1553
1554#ifndef Py_LIMITED_API
1555PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1556 PyObject *unicode /* Unicode object */
1557 );
1558#endif
1559
Martin v. Löwis011e8422009-05-05 04:43:17 +00001560/* --- File system encoding ---------------------------------------------- */
1561
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001562/* ParseTuple converter: encode str objects to bytes using
1563 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001564
1565PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1566
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001567/* ParseTuple converter: decode bytes objects to unicode using
1568 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1569
1570PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1571
Victor Stinner77c38622010-05-14 15:58:55 +00001572/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1573 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001574
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001575 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1576 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001577
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001578 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001579*/
1580
1581PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1582 const char *s /* encoded string */
1583 );
1584
Victor Stinner77c38622010-05-14 15:58:55 +00001585/* Decode a string using Py_FileSystemDefaultEncoding
1586 and the "surrogateescape" error handler.
1587
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001588 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1589 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001590*/
1591
Martin v. Löwis011e8422009-05-05 04:43:17 +00001592PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1593 const char *s, /* encoded string */
1594 Py_ssize_t size /* size */
1595 );
1596
Victor Stinnerae6265f2010-05-15 16:27:27 +00001597/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001598 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001599
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001600 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1601 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001602*/
1603
1604PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1605 PyObject *unicode /* Unicode object */
1606 );
1607
Guido van Rossumd8225182000-03-10 22:33:05 +00001608/* --- Methods & Slots ----------------------------------------------------
1609
1610 These are capable of handling Unicode objects and strings on input
1611 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001612 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001613
1614/* Concat two strings giving a new Unicode string. */
1615
Mark Hammond91a681d2002-08-12 07:21:58 +00001616PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 PyObject *left, /* Left string */
1618 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001619 );
1620
Walter Dörwald1ab83302007-05-18 17:15:44 +00001621/* Concat two strings and put the result in *pleft
1622 (sets *pleft to NULL on error) */
1623
1624PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001625 PyObject **pleft, /* Pointer to left string */
1626 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001627 );
1628
1629/* Concat two strings, put the result in *pleft and drop the right object
1630 (sets *pleft to NULL on error) */
1631
1632PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001633 PyObject **pleft, /* Pointer to left string */
1634 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001635 );
1636
Guido van Rossumd8225182000-03-10 22:33:05 +00001637/* Split a string giving a list of Unicode strings.
1638
1639 If sep is NULL, splitting will be done at all whitespace
1640 substrings. Otherwise, splits occur at the given separator.
1641
1642 At most maxsplit splits will be done. If negative, no limit is set.
1643
1644 Separators are not included in the resulting list.
1645
1646*/
1647
Mark Hammond91a681d2002-08-12 07:21:58 +00001648PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001649 PyObject *s, /* String to split */
1650 PyObject *sep, /* String separator */
1651 Py_ssize_t maxsplit /* Maxsplit count */
1652 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001653
1654/* Ditto, but split at line breaks.
1655
1656 CRLF is considered to be one line break. Line breaks are not
1657 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658
Mark Hammond91a681d2002-08-12 07:21:58 +00001659PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 PyObject *s, /* String to split */
1661 int keepends /* If true, line end markers are included */
1662 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001663
Thomas Wouters477c8d52006-05-27 19:21:47 +00001664/* Partition a string using a given separator. */
1665
1666PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001667 PyObject *s, /* String to partition */
1668 PyObject *sep /* String separator */
1669 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001670
1671/* Partition a string using a given separator, searching from the end of the
1672 string. */
1673
1674PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001675 PyObject *s, /* String to partition */
1676 PyObject *sep /* String separator */
1677 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001678
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001679/* Split a string giving a list of Unicode strings.
1680
1681 If sep is NULL, splitting will be done at all whitespace
1682 substrings. Otherwise, splits occur at the given separator.
1683
1684 At most maxsplit splits will be done; if maxsplit is negative, no
1685 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1686 the end of the string.
1687
1688 Separators are not included in the resulting list.
1689
1690*/
1691
1692PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 PyObject *s, /* String to split */
1694 PyObject *sep, /* String separator */
1695 Py_ssize_t maxsplit /* Maxsplit count */
1696 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001697
Guido van Rossumd8225182000-03-10 22:33:05 +00001698/* Translate a string by applying a character mapping table to it and
1699 return the resulting Unicode object.
1700
1701 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001702 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001703
1704 Mapping tables may be dictionaries or sequences. Unmapped character
1705 ordinals (ones which cause a LookupError) are left untouched and
1706 are copied as-is.
1707
1708*/
1709
Mark Hammond91a681d2002-08-12 07:21:58 +00001710PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001711 PyObject *str, /* String */
1712 PyObject *table, /* Translate table */
1713 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001714 );
1715
1716/* Join a sequence of strings using the given separator and return
1717 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718
Mark Hammond91a681d2002-08-12 07:21:58 +00001719PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 PyObject *separator, /* Separator string */
1721 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001722 );
1723
1724/* Return 1 if substr matches str[start:end] at the given tail end, 0
1725 otherwise. */
1726
Martin v. Löwis18e16552006-02-15 17:27:45 +00001727PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728 PyObject *str, /* String */
1729 PyObject *substr, /* Prefix or Suffix string */
1730 Py_ssize_t start, /* Start index */
1731 Py_ssize_t end, /* Stop index */
1732 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001733 );
1734
1735/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001736 given search direction or -1 if not found. -2 is returned in case
1737 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001738
Martin v. Löwis18e16552006-02-15 17:27:45 +00001739PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 PyObject *str, /* String */
1741 PyObject *substr, /* Substring to find */
1742 Py_ssize_t start, /* Start index */
1743 Py_ssize_t end, /* Stop index */
1744 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001745 );
1746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747/* Like PyUnicode_Find, but search for a single character only.

   NOTE(review): the searched range (str[start:end]), the direction
   values and the return convention (-1 not found, -2 on error) are
   presumably the same as for PyUnicode_Find -- confirm against the
   implementation. */
1748PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1749 PyObject *str, /* String */
1750 Py_UCS4 ch, /* Character to find */
1751 Py_ssize_t start, /* Start index */
1752 Py_ssize_t end, /* Stop index */
1753 int direction /* Find direction, as for PyUnicode_Find */
1754 );
1755
Barry Warsaw51ac5802000-03-20 16:36:48 +00001756/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001757
Martin v. Löwis18e16552006-02-15 17:27:45 +00001758PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001759 PyObject *str, /* String */
1760 PyObject *substr, /* Substring to count */
1761 Py_ssize_t start, /* Start index */
1762 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001763 );
1764
Barry Warsaw51ac5802000-03-20 16:36:48 +00001765/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001766 and return the resulting Unicode object. */
1767
Mark Hammond91a681d2002-08-12 07:21:58 +00001768PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001769 PyObject *str, /* String */
1770 PyObject *substr, /* Substring to find */
1771 PyObject *replstr, /* Substring to replace */
1772 Py_ssize_t maxcount /* Max. number of replacements to apply;
1773 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001774 );
1775
1776/* Compare two strings and return -1, 0, 1 for less than, equal,
1777 greater than resp. */
1778
Mark Hammond91a681d2002-08-12 07:21:58 +00001779PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001780 PyObject *left, /* Left string */
1781 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001782 );
1783
/* Compare a string object with an ASCII-encoded C string.

   NOTE(review): presumably returns -1, 0, 1 for less than, equal,
   greater than, like PyUnicode_Compare -- confirm against the
   implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001784PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1785 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001786 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001787 );
1788
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001789/* Rich compare two strings and return one of the following:
1790
1791 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001792 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001793 - Py_NotImplemented in case the type combination is unknown
1794
1795 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1796 case the conversion of the arguments to Unicode fails with a
1797 UnicodeDecodeError.
1798
1799 Possible values for op:
1800
1801 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1802
1803*/
1804
1805PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001806 PyObject *left, /* Left string */
1807 PyObject *right, /* Right string */
1808 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001809 );
1810
Thomas Wouters7e474022000-07-16 12:04:32 +00001811/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001812 the resulting Unicode string. */
1813
Mark Hammond91a681d2002-08-12 07:21:58 +00001814PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001815 PyObject *format, /* Format string */
1816 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001817 );
1818
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001819/* Checks whether element is contained in container and returns 1/0
1820 accordingly.
1821
1822 element has to coerce to a one-element Unicode string. -1 is
1823 returned in case of an error. */
1824
Mark Hammond91a681d2002-08-12 07:21:58 +00001825PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001826 PyObject *container, /* Container string */
1827 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001828 );
1829
Martin v. Löwis47383402007-08-15 07:32:56 +00001830/* Checks whether argument is a valid identifier. */
1831
1832PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1833
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001834#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001835/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001836PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001837 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001838 int striptype,
1839 PyObject *sepobj
1840 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001841#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001842
Eric Smith5807c412008-05-11 21:00:57 +00001843/* Using the current locale, insert the thousands grouping
1844 into the string pointed to by buffer. For the argument descriptions,
1845 see Objects/stringlib/localeutil.h */
1846
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001847#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001848PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1849 Py_ssize_t n_buffer,
1850 Py_UNICODE *digits,
1851 Py_ssize_t n_digits,
1852 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001853#endif
Eric Smith5807c412008-05-11 21:00:57 +00001854
Eric Smitha3b1ac82009-04-03 14:45:06 +00001855/* Using explicit passed-in values, insert the thousands grouping
1856 into the string pointed to by buffer. For the argument descriptions,
1857 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001858#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001860 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 int kind,
1862 void *buffer,
1863 Py_ssize_t n_buffer,
1864 void *digits,
1865 Py_ssize_t n_digits,
1866 Py_ssize_t min_width,
1867 const char *grouping,
1868 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001869#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001870/* === Characters Type APIs =============================================== */
1871
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001872/* Helper array used by Py_UNICODE_ISSPACE(). */
1873
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001874#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001875PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1876
Guido van Rossumd8225182000-03-10 22:33:05 +00001877/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001878 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001879
1880 These APIs are implemented in Objects/unicodectype.c.
1881
1882*/
1883
Mark Hammond91a681d2002-08-12 07:21:58 +00001884PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001885 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001886 );
1887
Mark Hammond91a681d2002-08-12 07:21:58 +00001888PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001889 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001890 );
1891
Mark Hammond91a681d2002-08-12 07:21:58 +00001892PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001893 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001894 );
1895
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001896PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001897 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001898 );
1899
1900PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001901 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001902 );
1903
Mark Hammond91a681d2002-08-12 07:21:58 +00001904PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001905 Py_UCS4 ch /* Unicode character (by value; no top-level const, matching the other Py_UCS4 prototypes here) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001906 );
1907
Mark Hammond91a681d2002-08-12 07:21:58 +00001908PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001909 Py_UCS4 ch /* Unicode character (by value; no top-level const, matching the other Py_UCS4 prototypes here) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001910 );
1911
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001912PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1913 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001914 );
1915
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001916PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1917 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001918 );
1919
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001920PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1921 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001922 );
1923
Mark Hammond91a681d2002-08-12 07:21:58 +00001924PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001925 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001926 );
1927
Mark Hammond91a681d2002-08-12 07:21:58 +00001928PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001929 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001930 );
1931
Mark Hammond91a681d2002-08-12 07:21:58 +00001932PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001933 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001934 );
1935
Mark Hammond91a681d2002-08-12 07:21:58 +00001936PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001937 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001938 );
1939
Mark Hammond91a681d2002-08-12 07:21:58 +00001940PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001941 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001942 );
1943
Mark Hammond91a681d2002-08-12 07:21:58 +00001944PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001945 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001946 );
1947
Georg Brandl559e5d72008-06-11 18:37:52 +00001948PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001949 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001950 );
1951
Mark Hammond91a681d2002-08-12 07:21:58 +00001952PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001953 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001954 );
1955
Victor Stinneref8d95c2010-08-16 22:03:11 +00001956PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1957 const Py_UNICODE *u
1958 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001959
1960PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001961 Py_UNICODE *s1,
1962 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001963
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001964PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1965 Py_UNICODE *s1, const Py_UNICODE *s2);
1966
Martin v. Löwis5b222132007-06-10 09:51:05 +00001967PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001968 Py_UNICODE *s1,
1969 const Py_UNICODE *s2,
1970 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001971
1972PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001973 const Py_UNICODE *s1,
1974 const Py_UNICODE *s2
1975 );
1976
1977PyAPI_FUNC(int) Py_UNICODE_strncmp(
1978 const Py_UNICODE *s1,
1979 const Py_UNICODE *s2,
1980 size_t n
1981 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001982
1983PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001984 const Py_UNICODE *s,
1985 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001986 );
1987
Victor Stinner331ea922010-08-10 16:37:20 +00001988PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001989 const Py_UNICODE *s,
1990 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001991 );
1992
Victor Stinner71133ff2010-09-01 23:43:53 +00001993/* Create a copy of a unicode string ending with a nul character. Return NULL
1994 and raise a MemoryError exception on memory allocation failure, otherwise
1995 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1996
Victor Stinner46408602010-09-03 16:18:00 +00001997PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001998 PyObject *unicode
1999 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002000#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002001
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002002#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2003/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not. FIXME: use PyObject* type for op (currently an untyped void*) */
2004PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2005 void *op,
2006 int check_content);
2007#endif
2008
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002009/********************* String Literals ****************************************/
2010/* This structure helps manage static strings. The basic usage goes like this:
2011 Instead of doing
2012
2013 r = PyObject_CallMethod(o, "foo", "args", ...);
2014
2015 do
2016
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002017 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002018 ...
2019 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2020
2021 PyId_foo is a static variable, either on block level or file level. On first
2022 usage, the string "foo" is interned, and the structures are linked. On interpreter
2023 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2024
2025 Alternatively, _Py_static_string allows choosing the variable name.
2026 _PyUnicode_FromId returns a new reference to the interned string.
2027 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2028*/
2029typedef struct _Py_Identifier {
2030 struct _Py_Identifier *next; /* link to another identifier; the structures are linked on first usage */
2031 const char* string; /* C string the identifier stands for (e.g. "foo") */
2032 PyObject *object; /* presumably the interned string object; 0 until first usage -- TODO confirm */
2033} _Py_Identifier;
2034
Martin v. Löwis87da8722011-10-09 11:54:42 +02002035#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002036#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002037
2038/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2039PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2040/* Clear all static strings. */
2041PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2042
Guido van Rossumd8225182000-03-10 22:33:05 +00002043#ifdef __cplusplus
2044}
2045#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002046#endif /* !Py_UNICODEOBJECT_H */