blob: 60bfcbe5ca9cad94ba546d2f373306f7c410fe14 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000145#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
152
153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
156
157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000161
162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
165
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000167
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168#define Py_UNICODE_ISALNUM(ch) \
169 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_UNICODE_ISDECIMAL(ch) || \
171 Py_UNICODE_ISDIGIT(ch) || \
172 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200174#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store `value` into the first `length` elements of the Py_UNICODE buffer
   `target`.  `target`, `value` and `length` are each evaluated exactly once
   (in that order), so arguments with side effects are safe.  A length of
   zero or less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* Macros to work with surrogates.

   Each predicate yields 1 when the code unit `ch` lies in the tested range
   and 0 otherwise:
     - Py_UNICODE_IS_SURROGATE:      0xD800..0xDFFF
     - Py_UNICODE_IS_HIGH_SURROGATE: 0xD800..0xDBFF
     - Py_UNICODE_IS_LOW_SURROGATE:  0xDC00..0xDFFF

   `ch` is parenthesized in the expansion so that compound arguments
   (e.g. `a | b` or `c ? x : y`) are compared as a whole.  It may be
   evaluated twice, so avoid arguments with side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
186/* Join two surrogate characters and return a single Py_UCS4 value. */
187#define Py_UNICODE_JOIN_SURROGATES(high, low) \
188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \
189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
Victor Stinner551ac952011-11-29 22:58:13 +0100190/* high surrogate = top 10 bits added to D800 */
191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10))
192/* low surrogate = bottom 10 bits added to DC00 */
193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before falling back to
   memcmp() over the whole substring.

   The wchar_t buffer and its length are obtained through
   PyUnicode_AS_UNICODE() and PyUnicode_GET_SIZE() rather than by direct
   member access: none of the PEP 393 structures declared in this header
   exposes both `wstr` and `wstr_length` as direct members.

   Both arguments are evaluated several times.
   NOTE(review): PyUnicode_AS_UNICODE() may need to build the wstr
   representation on demand; presumably it can fail and return NULL, which
   is not checked here -- confirm against callers. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(PyUnicode_AS_UNICODE(string) + (offset)) == \
      *(PyUnicode_AS_UNICODE(substring))) && \
     (*(PyUnicode_AS_UNICODE(string) + (offset) + \
        PyUnicode_GET_SIZE(substring) - 1) == \
      *(PyUnicode_AS_UNICODE(substring) + \
        PyUnicode_GET_SIZE(substring) - 1)) && \
     !memcmp(PyUnicode_AS_UNICODE(string) + (offset), \
             PyUnicode_AS_UNICODE(substring), \
             PyUnicode_GET_SIZE(substring) * sizeof(Py_UNICODE)))
202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000203#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000211#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214 structure. state.ascii and state.compact are set, and the data
215 immediately follow the structure. utf8_length and wstr_length can be found
216 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000217typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200218 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200219
220 - compact ascii:
221
222 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100223 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200224 * kind = PyUnicode_1BYTE_KIND
225 * compact = 1
226 * ascii = 1
227 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200228 * (length is the length of the utf8 and wstr strings)
229 * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200231
232 - compact:
233
234 * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237 PyUnicode_4BYTE_KIND
238 * compact = 1
239 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200240 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200242 * utf8_length = 0 if utf8 is NULL
243 * wstr is shared with data and wstr_length=length
244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200246 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200247 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200248
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200249 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200250
251 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100252 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100253 * length = 0 (use wstr_length)
254 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200255 * kind = PyUnicode_WCHAR_KIND
256 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200257 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200258 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100259 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200260 * wstr is not NULL
261 * data.any is NULL
262 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200263 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200266
         * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270 PyUnicode_4BYTE_KIND
271 * compact = 0
272 * ready = 1
273 * data.any is not NULL
         * utf8 is shared with data.any and utf8_length = length if ascii = 1
275 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100276 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
279 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200280
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200281 Compact strings use only one memory block (structure + characters),
282 whereas legacy strings use one block for the structure and one block
283 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200284
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200285 Legacy strings are created by PyUnicode_FromUnicode() and
286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287 when PyUnicode_READY() is called.
288
289 See also _PyUnicode_CheckConsistency().
290 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000291 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000293 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 struct {
295 /*
296 SSTATE_NOT_INTERNED (0)
297 SSTATE_INTERNED_MORTAL (1)
298 SSTATE_INTERNED_IMMORTAL (2)
299
300 If interned != SSTATE_NOT_INTERNED, the two references from the
301 dictionary to this object are *not* counted in ob_refcnt.
302 */
303 unsigned int interned:2;
304 /* Character size:
305
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200306 - PyUnicode_WCHAR_KIND (0):
307
308 * character type = wchar_t (16 or 32 bits, depending on the
309 platform)
310
311 - PyUnicode_1BYTE_KIND (1):
312
313 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100314 * all characters are in the range U+0000-U+00FF (latin1)
315 * if ascii is set, all characters are in the range U+0000-U+007F
316 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200317 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200318
319 - PyUnicode_2BYTE_KIND (2):
320
321 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100322 * all characters are in the range U+0000-U+FFFF (BMP)
323 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200324
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200325 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200326
327 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100328 * all characters are in the range U+0000-U+10FFFF
329 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200331 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200332 /* Compact is with respect to the allocation scheme. Compact unicode
333 objects only require one memory block while non-compact objects use
334 one block for the PyUnicodeObject struct and another for its data
335 buffer. */
336 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100337 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200340 unsigned int ascii:1;
341 /* The ready flag indicates whether the object layout is initialized
342 completely. This means that this is either a compact object, or
343 the data pointer is filled out. The bit is redundant, and helps
344 to minimize the test in PyUnicode_IS_READY(). */
345 unsigned int ready:1;
346 } state;
347 wchar_t *wstr; /* wchar_t representation (null-terminated) */
348} PyASCIIObject;
349
350/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200351 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200352 immediately follow the structure. */
353typedef struct {
354 PyASCIIObject _base;
355 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
356 * terminating \0. */
357 char *utf8; /* UTF-8 representation (null-terminated) */
358 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
359 * surrogates count as two code points. */
360} PyCompactUnicodeObject;
361
362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
363 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200364 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365typedef struct {
366 PyCompactUnicodeObject _base;
367 union {
368 void *any;
369 Py_UCS1 *latin1;
370 Py_UCS2 *ucs2;
371 Py_UCS4 *ucs4;
372 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000373} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000374#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000375
Mark Hammond91a681d2002-08-12 07:21:58 +0000376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000378
Thomas Wouters27d517b2007-02-25 20:39:11 +0000379#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000380 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000382
383/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000384#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200385
/* Length of the wstr (wchar_t) representation of `op`.

   Compact ASCII strings use the PyASCIIObject layout, which has no
   wstr_length member; their wstr length is the `length` field.  Every
   other form stores it in PyCompactUnicodeObject.wstr_length.

   `op` is parenthesized before being cast, so any pointer expression may
   be passed.  It is evaluated more than once.  No type check is done. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
390
391/* Returns the deprecated Py_UNICODE representation's size in code units
392 (this includes surrogate pairs as 2 units).
393 If the Py_UNICODE representation is not available, it will be computed
394 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
395
Victor Stinnerf3ae6202011-11-21 02:24:49 +0100396#define PyUnicode_GET_SIZE(op) \
397 (assert(PyUnicode_Check(op)), \
398 (((PyASCIIObject *)(op))->wstr) ? \
399 PyUnicode_WSTR_LENGTH(op) : \
400 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
401 assert(((PyASCIIObject *)(op))->wstr), \
402 PyUnicode_WSTR_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200403
Guido van Rossumd8225182000-03-10 22:33:05 +0000404#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
406
407/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
408 representation on demand. Using this macro is very inefficient now,
409 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
410 use PyUnicode_WRITE() and PyUnicode_READ(). */
411
Guido van Rossumd8225182000-03-10 22:33:05 +0000412#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413 (assert(PyUnicode_Check(op)), \
414 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
415 PyUnicode_AsUnicode((PyObject *)(op)))
416
Guido van Rossumd8225182000-03-10 22:33:05 +0000417#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418 ((const char *)(PyUnicode_AS_UNICODE(op)))
419
420
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200422
Victor Stinner6f9568b2011-11-17 00:12:44 +0100423/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200424
425/* Interning state. */
426#define SSTATE_NOT_INTERNED 0
427#define SSTATE_INTERNED_MORTAL 1
428#define SSTATE_INTERNED_IMMORTAL 2
429
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.

   With assertions enabled, the macro checks that `op` is a unicode object
   and that it is ready before reading state.ascii.  `op` is parenthesized
   before being cast, so any pointer expression may be passed; it may be
   evaluated more than once. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
Victor Stinnera3b334d2011-10-03 13:53:37 +0200437
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.

   `op` is parenthesized before being cast, so any pointer expression may
   be passed.  It is evaluated twice when state.ascii is set. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447
448/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200449 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 has not been called yet. */
Victor Stinner0ba5af22011-12-17 22:18:27 +0100451#define PyUnicode_WCHAR_KIND 0
452
453/* Return values of the PyUnicode_KIND() macro */
454enum PyUnicode_Kind {
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200455 PyUnicode_1BYTE_KIND = 1,
456 PyUnicode_2BYTE_KIND = 2,
457 PyUnicode_4BYTE_KIND = 4
458};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200459
Georg Brandl4975a9b2011-10-05 16:12:21 +0200460/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200462 No checks are performed, use PyUnicode_KIND() before to ensure
463 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200464
465#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
466#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
467#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
468
Victor Stinner157f83f2011-09-28 21:41:31 +0200469/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200470#define PyUnicode_KIND(op) \
471 (assert(PyUnicode_Check(op)), \
472 assert(PyUnicode_IS_READY(op)), \
473 ((PyASCIIObject *)(op))->state.kind)
474
Victor Stinner157f83f2011-09-28 21:41:31 +0200475/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200476#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200477 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200478 ((void*)((PyASCIIObject*)(op) + 1)) : \
479 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
480
481#define _PyUnicode_NONCOMPACT_DATA(op) \
482 (assert(((PyUnicodeObject*)(op))->data.any), \
483 ((((PyUnicodeObject *)(op))->data.any)))
484
485#define PyUnicode_DATA(op) \
486 (assert(PyUnicode_Check(op)), \
487 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
488 _PyUnicode_NONCOMPACT_DATA(op))
489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490/* In the access macros below, "kind" may be evaluated more than once.
491 All other macro parameters are evaluated exactly once, so it is safe
492 to put side effects into them (such as increasing the index). */
493
494/* Write into the canonical representation, this macro does not do any sanity
495 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200496 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200497 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200498 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200499#define PyUnicode_WRITE(kind, data, index, value) \
500 do { \
501 switch ((kind)) { \
502 case PyUnicode_1BYTE_KIND: { \
503 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
504 break; \
505 } \
506 case PyUnicode_2BYTE_KIND: { \
507 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
508 break; \
509 } \
510 default: { \
511 assert((kind) == PyUnicode_4BYTE_KIND); \
512 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
513 } \
514 } \
515 } while (0)
516
Georg Brandl07de3252011-10-05 16:47:38 +0200517/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200518 or ready calls are performed. */
519#define PyUnicode_READ(kind, data, index) \
520 ((Py_UCS4) \
521 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200522 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200523 ((kind) == PyUnicode_2BYTE_KIND ? \
524 ((const Py_UCS2 *)(data))[(index)] : \
525 ((const Py_UCS4 *)(data))[(index)] \
526 ) \
527 ))
528
529/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
530 calls PyUnicode_KIND() and might call it twice. For single reads, use
531 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
532 cache kind and use PyUnicode_READ instead. */
533#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200534 (assert(PyUnicode_Check(unicode)), \
535 assert(PyUnicode_IS_READY(unicode)), \
536 (Py_UCS4) \
537 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
538 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
539 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
540 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
541 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
542 ) \
543 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200544
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_READY() to ensure that. */
548#define PyUnicode_GET_LENGTH(op) \
549 (assert(PyUnicode_Check(op)), \
550 assert(PyUnicode_IS_READY(op)), \
551 ((PyASCIIObject *)(op))->length)
552
553
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   Reads the state.ready bit only; no type check is performed.  `op` is
   parenthesized before being cast, so any pointer expression may be
   passed.  It is evaluated exactly once. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
558
Victor Stinnera3b334d2011-10-03 13:53:37 +0200559/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200561 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200562 Returns 0 on success and -1 on errors. */
563#define PyUnicode_READY(op) \
564 (assert(PyUnicode_Check(op)), \
565 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200566 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200567
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   The result is picked from the string's ASCII flag and kind:
   0x7f (ASCII), 0xff (1-byte kind), 0xffff (2-byte kind), else 0x10ffff.
   The string must be ready. */
#define PyUnicode_MAX_CHAR_VALUE(op)                        \
    (assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_ASCII(op) ?                              \
      (0x7fU) :                                             \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?         \
       (0xffU) :                                            \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?        \
        (0xffffU) :                                         \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200580
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000581#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000582
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace".  Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000591
592/* === Public API ========================================================= */
593
594/* --- Plain Py_UNICODE --------------------------------------------------- */
595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200596/* With PEP 393, this is the recommended way to allocate a new unicode object.
597 This function will allocate the object and its buffer in a single memory
598 block. Objects created using this function are not resizable. */
599#ifndef Py_LIMITED_API
600PyAPI_FUNC(PyObject*) PyUnicode_New(
601 Py_ssize_t size, /* Number of code points in the new string */
602 Py_UCS4 maxchar /* maximum code point value in the string */
603 );
604#endif
605
Victor Stinnerd8f65102011-09-29 19:43:17 +0200606/* Initializes the canonical string representation from a the deprecated
607 wstr/Py_UNICODE representation. This function is used to convert Unicode
608 objects which were created using the old API to the new flexible format
609 introduced with PEP 393.
610
611 Don't call this function directly, use the public PyUnicode_READY() macro
612 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200613#ifndef Py_LIMITED_API
614PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200615 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616 );
617#endif
618
Victor Stinner034f6cf2011-09-30 02:26:44 +0200619/* Get a copy of a Unicode string. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100620#ifndef Py_LIMITED_API
621PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
Victor Stinner034f6cf2011-09-30 02:26:44 +0200622 PyObject *unicode
623 );
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100624#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200626/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200627 character conversion when necessary and falls back to memcpy if possible.
628
Victor Stinnera0702ab2011-09-29 14:14:38 +0200629 Fail if to is too small (smaller than how_many or smaller than
630 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
631 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200632
633 Return the number of written character, or return -1 and raise an exception
634 on error.
635
636 Pseudo-code:
637
638 how_many = min(how_many, len(from) - from_start)
639 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
640 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200641
642 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200643 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200644#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200645PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646 PyObject *to,
647 Py_ssize_t to_start,
648 PyObject *from,
649 Py_ssize_t from_start,
650 Py_ssize_t how_many
651 );
652#endif
653
Guido van Rossumd8225182000-03-10 22:33:05 +0000654/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000656
657 u may be NULL which causes the contents to be undefined. It is the
658 user's responsibility to fill in the needed data afterwards. Note
659 that modifying the Unicode object contents after construction is
660 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000661
662 The buffer is copied into the new object. */
663
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000664#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000665PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000666 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000667 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000668 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000669#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000670
Georg Brandl952867a2010-06-27 10:17:12 +0000671/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000673 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000674 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000675 );
676
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000677/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200678 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000679PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000680 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000681 );
682
Victor Stinnerb9275c12011-10-05 14:01:42 +0200683/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
684 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200685#ifndef Py_LIMITED_API
686PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
687 int kind,
688 const void *buffer,
689 Py_ssize_t size);
690#endif
691
692PyAPI_FUNC(PyObject*) PyUnicode_Substring(
693 PyObject *str,
694 Py_ssize_t start,
695 Py_ssize_t end);
696
Georg Brandldb6c7f52011-10-07 11:19:11 +0200697/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200698 is set. Return NULL and raise an exception on error. Raise a ValueError if
699 the buffer is smaller than the string. Return buffer on success.
700
701 buflen is the length of the buffer in (Py_UCS4) characters. */
702PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
703 PyObject *unicode,
704 Py_UCS4* buffer,
705 Py_ssize_t buflen,
706 int copy_null);
707
708/* Copy the string into a UCS4 buffer. A new buffer is allocated using
709 * PyMem_Malloc; if this fails, NULL is returned with a memory error
710 exception set. */
711PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
712
Guido van Rossumd8225182000-03-10 22:33:05 +0000713/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714 Py_UNICODE buffer.
715 If the wchar_t/Py_UNICODE representation is not yet available, this
716 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000717
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000718#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000719PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000721 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000722#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724/* Return a read-only pointer to the Unicode object's internal
725 Py_UNICODE buffer and save the length at size.
726 If the wchar_t/Py_UNICODE representation is not yet available, this
727 function will calculate it. */
728
729#ifndef Py_LIMITED_API
730PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
731 PyObject *unicode, /* Unicode object */
732 Py_ssize_t *size /* location where to save the length */
733 );
734#endif
735
Guido van Rossumd8225182000-03-10 22:33:05 +0000736/* Get the length of the Unicode object. */
737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
739 PyObject *unicode
740);
741
Victor Stinner157f83f2011-09-28 21:41:31 +0200742/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200743 string representation. */
744
Martin v. Löwis18e16552006-02-15 17:27:45 +0000745PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000747 );
748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200749/* Read a character from the string. */
750
751PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
752 PyObject *unicode,
753 Py_ssize_t index
754 );
755
756/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200757 PyUnicode_New, must not be shared, and must not have been hashed yet.
758
759 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760
761PyAPI_FUNC(int) PyUnicode_WriteChar(
762 PyObject *unicode,
763 Py_ssize_t index,
764 Py_UCS4 character
765 );
766
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000767#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000768/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000769PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000770#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000771
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100772/* Resize an Unicode object. The length is the number of characters, except
773 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
774 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000775
776 *unicode is modified to point to the new (resized) object and 0
777 returned on success.
778
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100779 Try to resize the string in place (which is usually faster than allocating
780 a new string and copy characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000781
782 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100783 is returned and *unicode left untouched.
784
785 WARNING: The function doesn't check string content, the result may not be a
786 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000787
Mark Hammond91a681d2002-08-12 07:21:58 +0000788PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 PyObject **unicode, /* Pointer to the Unicode object */
790 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000791 );
792
Guido van Rossumd8225182000-03-10 22:33:05 +0000793/* Coerce obj to an Unicode object and return a reference with
794 *incremented* refcount.
795
796 Coercion is done in the following way:
797
Georg Brandl952867a2010-06-27 10:17:12 +0000798 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000799 under the assumptions that they contain data using the UTF-8
800 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000801
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000802 2. All other objects (including Unicode objects) raise an
803 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000804
805 The API returns NULL in case of an error. The caller is responsible
806 for decref'ing the returned objects.
807
808*/
809
Mark Hammond91a681d2002-08-12 07:21:58 +0000810PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000812 const char *encoding, /* encoding */
813 const char *errors /* error handling */
814 );
815
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000816/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000817 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000819 Unicode objects are passed back as-is (subclasses are converted to
820 true Unicode objects), all other objects are delegated to
821 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000822 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000823
824 The API returns NULL in case of an error. The caller is responsible
825 for decref'ing the returned objects.
826
827*/
828
Mark Hammond91a681d2002-08-12 07:21:58 +0000829PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000831 );
832
Victor Stinner1205f272010-09-11 00:54:47 +0000833PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
834 const char *format, /* ASCII-encoded string */
835 va_list vargs
836 );
837PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
838 const char *format, /* ASCII-encoded string */
839 ...
840 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000842#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000843/* Format the object based on the format_spec, as defined in PEP 3101
844 (Advanced String Formatting). */
845PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 PyObject *format_spec,
847 Py_ssize_t start,
848 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000849#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000850
Walter Dörwald16807132007-05-25 13:52:07 +0000851PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
852PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000853PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
854 const char *u /* UTF-8 encoded string */
855 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000856#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000857PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000858#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000859
/* Read the interned state of a string.  Use only if you know it's a string:
   the macro casts op to PyASCIIObject without any type check. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000863
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Convert the Unicode object to a wide character string.  The output string
   always ends with a nul character.  If size is not NULL, write the number of
   wide characters (excluding the null character) into *size.

   On success, returns a buffer allocated with the PyMem allocator (use
   PyMem_Free() to free it).  On error, returns NULL, *size is undefined and
   raises a MemoryError. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* number of characters of the result */
    );

#ifndef Py_LIMITED_API
/* NOTE(review): undocumented in this header; presumably returns the
   contents of s converted to the given kind -- confirm ownership of the
   returned buffer against the implementation. */
PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
#endif

#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915/* --- Unicode ordinals --------------------------------------------------- */
916
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000917/* Create a Unicode Object from the given Unicode code point ordinal.
918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 The ordinal must be in range(0x10000) on narrow Python builds
920 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
921 raised in case it is not.
922
923*/
924
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000925PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000926
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000927/* --- Free-list management ----------------------------------------------- */
928
929/* Clear the free list used by the Unicode implementation.
930
931 This can be used to release memory used for objects on the free
932 list back to the Python memory allocator.
933
934*/
935
936PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
937
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments encoding and errors.  These
   parameters encoding and errors have the same semantics as the ones
   of the builtin str() API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec.  Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface.  Only deviations from the
   generic ones are documented.

*/
955
Fred Drakecb093fe2000-05-09 19:51:53 +0000956/* --- Manage the default encoding ---------------------------------------- */
957
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000958/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000959 Unicode object unicode and the size of the encoded representation
960 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000961
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000962 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000963
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200964 This function caches the UTF-8 encoded string in the unicodeobject
965 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 when the unicodeobject is deallocated.
967
968 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
969 support the previous internal function with the same behaviour.
970
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000971 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000972 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000973
974 *** If you need to access the Unicode object as UTF-8 bytes string,
975 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000976*/
977
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000978#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000980 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000981 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000983#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000984
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000985/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000986 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
989 in the unicodeobject.
990
991 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
992 support the previous internal function with the same behaviour.
993
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000994 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000995 extracted from the returned data.
996
997 *** This API is for interpreter INTERNAL USE ONLY and will likely
998 *** be removed or changed for Python 3.1.
999
1000 *** If you need to access the Unicode object as UTF-8 bytes string,
1001 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001002
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001003*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001004
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001005#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1007#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001008#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001009
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001010/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001011
Mark Hammond91a681d2002-08-12 07:21:58 +00001012PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001013
Guido van Rossumd8225182000-03-10 22:33:05 +00001014/* --- Generic Codecs ----------------------------------------------------- */
1015
1016/* Create a Unicode object by decoding the encoded string s of the
1017 given size. */
1018
Mark Hammond91a681d2002-08-12 07:21:58 +00001019PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001020 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001021 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001022 const char *encoding, /* encoding */
1023 const char *errors /* error handling */
1024 );
1025
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001026/* Decode a Unicode object unicode and return the result as Python
1027 object. */
1028
1029PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 PyObject *unicode, /* Unicode object */
1031 const char *encoding, /* encoding */
1032 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001033 );
1034
1035/* Decode a Unicode object unicode and return the result as Unicode
1036 object. */
1037
1038PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001039 PyObject *unicode, /* Unicode object */
1040 const char *encoding, /* encoding */
1041 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001042 );
1043
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001044/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001045 Python string object. */
1046
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001047#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001048PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001049 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001050 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001051 const char *encoding, /* encoding */
1052 const char *errors /* error handling */
1053 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001054#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001055
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001056/* Encodes a Unicode object and returns the result as Python
1057 object. */
1058
1059PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001060 PyObject *unicode, /* Unicode object */
1061 const char *encoding, /* encoding */
1062 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001063 );
1064
Guido van Rossumd8225182000-03-10 22:33:05 +00001065/* Encodes a Unicode object and returns the result as Python string
1066 object. */
1067
Mark Hammond91a681d2002-08-12 07:21:58 +00001068PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 PyObject *unicode, /* Unicode object */
1070 const char *encoding, /* encoding */
1071 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001072 );
1073
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001074/* Encodes a Unicode object and returns the result as Unicode
1075 object. */
1076
1077PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 PyObject *unicode, /* Unicode object */
1079 const char *encoding, /* encoding */
1080 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001081 );
1082
1083/* Build an encoding map. */
1084
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001085PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1086 PyObject* string /* 256 character map */
1087 );
1088
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089/* --- UTF-7 Codecs ------------------------------------------------------- */
1090
Mark Hammond91a681d2002-08-12 07:21:58 +00001091PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 const char *string, /* UTF-7 encoded string */
1093 Py_ssize_t length, /* size of string */
1094 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 );
1096
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001097PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 const char *string, /* UTF-7 encoded string */
1099 Py_ssize_t length, /* size of string */
1100 const char *errors, /* error handling */
1101 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001102 );
1103
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001104#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001105PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 const Py_UNICODE *data, /* Unicode char buffer */
1107 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1108 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1109 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1110 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001111 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001112PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1113 PyObject *unicode, /* Unicode object */
1114 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1115 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1116 const char *errors /* error handling */
1117 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001118#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119
Guido van Rossumd8225182000-03-10 22:33:05 +00001120/* --- UTF-8 Codecs ------------------------------------------------------- */
1121
Mark Hammond91a681d2002-08-12 07:21:58 +00001122PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 const char *string, /* UTF-8 encoded string */
1124 Py_ssize_t length, /* size of string */
1125 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001126 );
1127
Walter Dörwald69652032004-09-07 20:24:22 +00001128PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001129 const char *string, /* UTF-8 encoded string */
1130 Py_ssize_t length, /* size of string */
1131 const char *errors, /* error handling */
1132 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001133 );
1134
Mark Hammond91a681d2002-08-12 07:21:58 +00001135PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001137 );
1138
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001139#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1141 PyObject *unicode,
1142 const char *errors);
1143
Mark Hammond91a681d2002-08-12 07:21:58 +00001144PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 const Py_UNICODE *data, /* Unicode char buffer */
1146 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1147 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001148 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001149#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001150
Walter Dörwald41980ca2007-08-16 21:55:45 +00001151/* --- UTF-32 Codecs ------------------------------------------------------ */
1152
1153/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1154 the corresponding Unicode object.
1155
1156 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001157 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001158
1159 If byteorder is non-NULL, the decoder starts decoding using the
1160 given byte order:
1161
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 *byteorder == -1: little endian
1163 *byteorder == 0: native order
1164 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001165
1166 In native mode, the first four bytes of the stream are checked for a
1167 BOM mark. If found, the BOM mark is analysed, the byte order
1168 adjusted and the BOM skipped. In the other modes, no BOM mark
1169 interpretation is done. After completion, *byteorder is set to the
1170 current byte order at the end of input data.
1171
1172 If byteorder is NULL, the codec starts in native order mode.
1173
1174*/
1175
1176PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001177 const char *string, /* UTF-32 encoded string */
1178 Py_ssize_t length, /* size of string */
1179 const char *errors, /* error handling */
1180 int *byteorder /* pointer to byteorder to use
1181 0=native;-1=LE,1=BE; updated on
1182 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001183 );
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 const char *string, /* UTF-32 encoded string */
1187 Py_ssize_t length, /* size of string */
1188 const char *errors, /* error handling */
1189 int *byteorder, /* pointer to byteorder to use
1190 0=native;-1=LE,1=BE; updated on
1191 exit */
1192 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001193 );
1194
1195/* Returns a Python string using the UTF-32 encoding in native byte
1196 order. The string always starts with a BOM mark. */
1197
1198PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001199 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001200 );
1201
1202/* Returns a Python string object holding the UTF-32 encoded value of
1203 the Unicode data.
1204
1205 If byteorder is not 0, output is written according to the following
1206 byte order:
1207
1208 byteorder == -1: little endian
1209 byteorder == 0: native byte order (writes a BOM mark)
1210 byteorder == 1: big endian
1211
1212 If byteorder is 0, the output string will always start with the
1213 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1214 prepended.
1215
1216*/
1217
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001218#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001219PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 const Py_UNICODE *data, /* Unicode char buffer */
1221 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1222 const char *errors, /* error handling */
1223 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001224 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001225PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1226 PyObject *object, /* Unicode object */
1227 const char *errors, /* error handling */
1228 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1229 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001230#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001231
Guido van Rossumd8225182000-03-10 22:33:05 +00001232/* --- UTF-16 Codecs ------------------------------------------------------ */
1233
Guido van Rossum9e896b32000-04-05 20:11:21 +00001234/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001235 the corresponding Unicode object.
1236
1237 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001239
1240 If byteorder is non-NULL, the decoder starts decoding using the
1241 given byte order:
1242
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 *byteorder == -1: little endian
1244 *byteorder == 0: native order
1245 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001246
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001247 In native mode, the first two bytes of the stream are checked for a
1248 BOM mark. If found, the BOM mark is analysed, the byte order
1249 adjusted and the BOM skipped. In the other modes, no BOM mark
1250 interpretation is done. After completion, *byteorder is set to the
1251 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001252
1253 If byteorder is NULL, the codec starts in native order mode.
1254
1255*/
1256
Mark Hammond91a681d2002-08-12 07:21:58 +00001257PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 const char *string, /* UTF-16 encoded string */
1259 Py_ssize_t length, /* size of string */
1260 const char *errors, /* error handling */
1261 int *byteorder /* pointer to byteorder to use
1262 0=native;-1=LE,1=BE; updated on
1263 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001264 );
1265
Walter Dörwald69652032004-09-07 20:24:22 +00001266PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 const char *string, /* UTF-16 encoded string */
1268 Py_ssize_t length, /* size of string */
1269 const char *errors, /* error handling */
1270 int *byteorder, /* pointer to byteorder to use
1271 0=native;-1=LE,1=BE; updated on
1272 exit */
1273 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001274 );
1275
Guido van Rossumd8225182000-03-10 22:33:05 +00001276/* Returns a Python string using the UTF-16 encoding in native byte
1277 order. The string always starts with a BOM mark. */
1278
Mark Hammond91a681d2002-08-12 07:21:58 +00001279PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001281 );
1282
1283/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001284 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001285
1286 If byteorder is not 0, output is written according to the following
1287 byte order:
1288
1289 byteorder == -1: little endian
1290 byteorder == 0: native byte order (writes a BOM mark)
1291 byteorder == 1: big endian
1292
1293 If byteorder is 0, the output string will always start with the
1294 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1295 prepended.
1296
1297 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1298 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001299 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001300
1301*/
1302
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001303#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001304PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 const Py_UNICODE *data, /* Unicode char buffer */
1306 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1307 const char *errors, /* error handling */
1308 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001310PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1311 PyObject* unicode, /* Unicode object */
1312 const char *errors, /* error handling */
1313 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1314 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001315#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001316
1317/* --- Unicode-Escape Codecs ---------------------------------------------- */
1318
Mark Hammond91a681d2002-08-12 07:21:58 +00001319PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 const char *string, /* Unicode-Escape encoded string */
1321 Py_ssize_t length, /* size of string */
1322 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001323 );
1324
Mark Hammond91a681d2002-08-12 07:21:58 +00001325PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001327 );
1328
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001329#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001330PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001331 const Py_UNICODE *data, /* Unicode char buffer */
1332 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001333 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001334#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001335
1336/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1337
Mark Hammond91a681d2002-08-12 07:21:58 +00001338PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001339 const char *string, /* Raw-Unicode-Escape encoded string */
1340 Py_ssize_t length, /* size of string */
1341 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001342 );
1343
Mark Hammond91a681d2002-08-12 07:21:58 +00001344PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001345 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001346 );
1347
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001348#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001349PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 const Py_UNICODE *data, /* Unicode char buffer */
1351 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001352 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001353#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001354
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001355/* --- Unicode Internal Codec ---------------------------------------------
1356
1357 Only for internal use in _codecsmodule.c */
1358
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001359#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001360PyObject *_PyUnicode_DecodeUnicodeInternal(
1361 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001363 const char *errors
1364 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001365#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001366
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001368
1369 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1370
1371*/
1372
Mark Hammond91a681d2002-08-12 07:21:58 +00001373PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 const char *string, /* Latin-1 encoded string */
1375 Py_ssize_t length, /* size of string */
1376 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001377 );
1378
Mark Hammond91a681d2002-08-12 07:21:58 +00001379PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001381 );
1382
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001383#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1385 PyObject* unicode,
1386 const char* errors);
1387
Mark Hammond91a681d2002-08-12 07:21:58 +00001388PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 const Py_UNICODE *data, /* Unicode char buffer */
1390 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1391 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001392 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001393#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001394
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001396
1397 Only 7-bit ASCII data is accepted. All other codes generate errors.
1398
1399*/
1400
Mark Hammond91a681d2002-08-12 07:21:58 +00001401PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 const char *string, /* ASCII encoded string */
1403 Py_ssize_t length, /* size of string */
1404 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001405 );
1406
Mark Hammond91a681d2002-08-12 07:21:58 +00001407PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001409 );
1410
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001411#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1413 PyObject* unicode,
1414 const char* errors);
1415
Mark Hammond91a681d2002-08-12 07:21:58 +00001416PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 const Py_UNICODE *data, /* Unicode char buffer */
1418 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1419 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001420 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001421#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001422
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001424
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001426
1427 Decoding mappings must map single string characters to single
1428 Unicode characters, integers (which are then interpreted as Unicode
1429 ordinals) or None (meaning "undefined mapping" and causing an
1430 error).
1431
1432 Encoding mappings must map single Unicode characters to single
1433 string characters, integers (which are then interpreted as Latin-1
1434 ordinals) or None (meaning "undefined mapping" and causing an
1435 error).
1436
1437 If a character lookup fails with a LookupError, the character is
1438 copied as-is meaning that its ordinal value will be interpreted as
1439 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1440 to contain those mappings which map characters to different code
1441 points.
1442
1443*/
1444
Mark Hammond91a681d2002-08-12 07:21:58 +00001445PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001446 const char *string, /* Encoded string */
1447 Py_ssize_t length, /* size of string */
1448 PyObject *mapping, /* character mapping
1449 (char ordinal -> unicode ordinal) */
1450 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001451 );
1452
Mark Hammond91a681d2002-08-12 07:21:58 +00001453PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001454 PyObject *unicode, /* Unicode object */
1455 PyObject *mapping /* character mapping
1456 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001457 );
1458
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001459#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001460PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001461 const Py_UNICODE *data, /* Unicode char buffer */
1462 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1463 PyObject *mapping, /* character mapping
1464 (unicode ordinal -> char ordinal) */
1465 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001466 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001467PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1468 PyObject *unicode, /* Unicode object */
1469 PyObject *mapping, /* character mapping
1470 (unicode ordinal -> char ordinal) */
1471 const char *errors /* error handling */
1472 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001473#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001474
1475/* Translate a Py_UNICODE buffer of the given length by applying a
1476 character mapping table to it and return the resulting Unicode
1477 object.
1478
1479 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001481
1482 Mapping tables may be dictionaries or sequences. Unmapped character
1483 ordinals (ones which cause a LookupError) are left untouched and
1484 are copied as-is.
1485
1486*/
1487
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001488#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001489PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 const Py_UNICODE *data, /* Unicode char buffer */
1491 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1492 PyObject *table, /* Translate table */
1493 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001494 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001495#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001496
Victor Stinner99b95382011-07-04 14:23:54 +02001497#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001498
Guido van Rossumefec1152000-03-28 02:01:15 +00001499/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001500
Mark Hammond91a681d2002-08-12 07:21:58 +00001501PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001502 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001503 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001504 const char *errors /* error handling */
1505 );
1506
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001507PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1508 const char *string, /* MBCS encoded string */
1509 Py_ssize_t length, /* size of string */
1510 const char *errors, /* error handling */
1511 Py_ssize_t *consumed /* bytes consumed */
1512 );
1513
Victor Stinner3a50e702011-10-18 21:21:00 +02001514PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1515 int code_page, /* code page number */
1516 const char *string, /* encoded string */
1517 Py_ssize_t length, /* size of string */
1518 const char *errors, /* error handling */
1519 Py_ssize_t *consumed /* bytes consumed */
1520 );
1521
Mark Hammond91a681d2002-08-12 07:21:58 +00001522PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001523 PyObject *unicode /* Unicode object */
1524 );
1525
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001526#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001527PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001528 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001529 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001530 const char *errors /* error handling */
1531 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001532#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001533
Victor Stinner3a50e702011-10-18 21:21:00 +02001534PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1535 int code_page, /* code page number */
1536 PyObject *unicode, /* Unicode object */
1537 const char *errors /* error handling */
1538 );
1539
Victor Stinner99b95382011-07-04 14:23:54 +02001540#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001541
Guido van Rossum9e896b32000-04-05 20:11:21 +00001542/* --- Decimal Encoder ---------------------------------------------------- */
1543
1544/* Takes a Unicode string holding a decimal value and writes it into
1545 an output buffer using standard ASCII digit codes.
1546
1547 The output buffer has to provide at least length+1 bytes of storage
1548 area. The output string is 0-terminated.
1549
1550 The encoder converts whitespace to ' ', decimal characters to their
1551 corresponding ASCII digit and all other Latin-1 characters except
1552 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1553 are treated as errors. This includes embedded NULL bytes.
1554
1555 Error handling is defined by the errors argument:
1556
1557 NULL or "strict": raise a ValueError
1558 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001560 "replace": replaces illegal characters with '?'
1561
1562 Returns 0 on success, -1 on failure.
1563
1564*/
1565
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001566#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001567PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001568 Py_UNICODE *s, /* Unicode buffer */
1569 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1570 char *output, /* Output buffer; must have size >= length+1
                      (the output string is 0-terminated) */
1571 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001572 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001573#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001574
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001575/* Transforms code points that have decimal digit property to the
1576 corresponding ASCII digit code points.
1577
1578 Returns a new Unicode string on success, NULL on failure.
1579*/
1580
Georg Brandlb5503082010-12-05 11:40:48 +00001581#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001582PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1583 Py_UNICODE *s, /* Unicode buffer */
1584 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1585 );
Georg Brandlb5503082010-12-05 11:40:48 +00001586#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001587
Victor Stinner6f9568b2011-11-17 00:12:44 +01001588/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 as argument instead of a raw buffer and length. This function additionally
1590 transforms spaces to ASCII because this is what the callers in longobject,
1591 floatobject, and complexobject did anyways. */
1592
1593#ifndef Py_LIMITED_API
1594PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1595 PyObject *unicode /* Unicode object */
1596 );
1597#endif
1598
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001599/* --- Locale encoding --------------------------------------------------- */
1600
1601/* Decode a string from the current locale encoding. The decoder is strict if
1602 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
1603 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
1604 be decoded as a surrogate character and *surrogateescape* is not equal to
1605 zero, the byte sequence is escaped using the 'surrogateescape' error handler
1606 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001607 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001608
1609PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1610 const char *str,
1611 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01001612 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001613
1614/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1615 length using strlen(). */
1616
1617PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1618 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001619 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001620
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001621/* Encode a Unicode object to the current locale encoding. The encoder is
1622 strict if *surrogateescape* is equal to zero, otherwise the
1623 "surrogateescape" error handler is used. Return a bytes object. The string
1624 cannot contain embedded null characters. */
1625
1626PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1627 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +01001628 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001629 );
1630
Martin v. Löwis011e8422009-05-05 04:43:17 +00001631/* --- File system encoding ---------------------------------------------- */
1632
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001633/* ParseTuple converter: encode str objects to bytes using
1634 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001635
1636PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1637
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001638/* ParseTuple converter: decode bytes objects to unicode using
1639 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1640
1641PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1642
Victor Stinner77c38622010-05-14 15:58:55 +00001643/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1644 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001645
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001646 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1647 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001648
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001649 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001650*/
1651
1652PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1653 const char *s /* encoded string */
1654 );
1655
Victor Stinner77c38622010-05-14 15:58:55 +00001656/* Decode a string using Py_FileSystemDefaultEncoding
1657 and the "surrogateescape" error handler.
1658
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001659 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1660 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001661*/
1662
Martin v. Löwis011e8422009-05-05 04:43:17 +00001663PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1664 const char *s, /* encoded string */
1665 Py_ssize_t size /* size */
1666 );
1667
Victor Stinnerae6265f2010-05-15 16:27:27 +00001668/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001669 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001670
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001671 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1672 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001673*/
1674
1675PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1676 PyObject *unicode
1677 );
1678
Guido van Rossumd8225182000-03-10 22:33:05 +00001679/* --- Methods & Slots ----------------------------------------------------
1680
1681 These are capable of handling Unicode objects and strings on input
1682 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001683 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001684
1685/* Concat two strings giving a new Unicode string. */
1686
Mark Hammond91a681d2002-08-12 07:21:58 +00001687PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 PyObject *left, /* Left string */
1689 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001690 );
1691
Walter Dörwald1ab83302007-05-18 17:15:44 +00001692/* Concat two strings and put the result in *pleft
1693 (sets *pleft to NULL on error) */
1694
1695PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 PyObject **pleft, /* Pointer to left string */
1697 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001698 );
1699
1700/* Concat two strings, put the result in *pleft and drop the right object
1701 (sets *pleft to NULL on error) */
1702
1703PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 PyObject **pleft, /* Pointer to left string */
1705 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001706 );
1707
Guido van Rossumd8225182000-03-10 22:33:05 +00001708/* Split a string giving a list of Unicode strings.
1709
1710 If sep is NULL, splitting will be done at all whitespace
1711 substrings. Otherwise, splits occur at the given separator.
1712
1713 At most maxsplit splits will be done. If negative, no limit is set.
1714
1715 Separators are not included in the resulting list.
1716
1717*/
1718
Mark Hammond91a681d2002-08-12 07:21:58 +00001719PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 PyObject *s, /* String to split */
1721 PyObject *sep, /* String separator */
1722 Py_ssize_t maxsplit /* Maxsplit count */
1723 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001724
1725/* Like PyUnicode_Split(), but split at line breaks.
1726
1727 CRLF is considered to be one line break. Line breaks are not
1728 included in the resulting list. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729
Mark Hammond91a681d2002-08-12 07:21:58 +00001730PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001731 PyObject *s, /* String to split */
1732 int keepends /* If true, line end markers are included */
1733 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001734
Thomas Wouters477c8d52006-05-27 19:21:47 +00001735/* Partition a string using a given separator. */
1736
1737PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001738 PyObject *s, /* String to partition */
1739 PyObject *sep /* String separator */
1740 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001741
1742/* Partition a string using a given separator, searching from the end of the
1743 string. */
1744
1745PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001746 PyObject *s, /* String to partition */
1747 PyObject *sep /* String separator */
1748 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001749
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001750/* Split a string giving a list of Unicode strings.
1751
1752 If sep is NULL, splitting will be done at all whitespace
1753 substrings. Otherwise, splits occur at the given separator.
1754
1755 At most maxsplit splits will be done; if maxsplit is negative, no
1756 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1757 the end of the string.
1758
1759 Separators are not included in the resulting list.
1760
1761*/
1762
1763PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001764 PyObject *s, /* String to split */
1765 PyObject *sep, /* String separator */
1766 Py_ssize_t maxsplit /* Maxsplit count */
1767 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001768
Guido van Rossumd8225182000-03-10 22:33:05 +00001769/* Translate a string by applying a character mapping table to it and
1770 return the resulting Unicode object.
1771
1772 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001773 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001774
1775 Mapping tables may be dictionaries or sequences. Unmapped character
1776 ordinals (ones which cause a LookupError) are copied to the result
1777 unchanged.
1778
1779*/
1780
Mark Hammond91a681d2002-08-12 07:21:58 +00001781PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001782 PyObject *str, /* String to translate */
1783 PyObject *table, /* Mapping table (dictionary or sequence) */
1784 const char *errors /* Error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001785 );
1786
1787/* Join a sequence of strings using the given separator and return
1788 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001789
Mark Hammond91a681d2002-08-12 07:21:58 +00001790PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001791 PyObject *separator, /* Separator string placed between items */
1792 PyObject *seq /* Sequence of strings to join */
Guido van Rossumd8225182000-03-10 22:33:05 +00001793 );
1794
1795/* Return 1 if substr matches str[start:end] at the given tail end, 0
1796 otherwise.

   direction selects which end of the slice is tested: -1 matches substr
   as a prefix, +1 as a suffix.

   NOTE(review): the return type is Py_ssize_t rather than int; the C API
   documentation states that -1 is returned if an error occurred -- confirm
   against the implementation. */
1797
Martin v. Löwis18e16552006-02-15 17:27:45 +00001798PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001799 PyObject *str, /* String */
1800 PyObject *substr, /* Prefix or Suffix string */
1801 Py_ssize_t start, /* Start index */
1802 Py_ssize_t end, /* Stop index */
1803 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001804 );
1805
1806/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001807 given search direction, or -1 if not found. -2 is returned if an
1808 error occurred, in which case an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001809
Martin v. Löwis18e16552006-02-15 17:27:45 +00001810PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001811 PyObject *str, /* String to search */
1812 PyObject *substr, /* Substring to find */
1813 Py_ssize_t start, /* Start index */
1814 Py_ssize_t end, /* Stop index */
1815 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001816 );
1817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818/* Like PyUnicode_Find, but search for a single character only.
   The start, end and direction arguments parallel those of
   PyUnicode_Find. */
1819PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1820 PyObject *str, /* String to search */
1821 Py_UCS4 ch, /* Character to find */
1822 Py_ssize_t start, /* Start index */
1823 Py_ssize_t end, /* Stop index */
1824 int direction /* Find direction, as for PyUnicode_Find */
1825 );
1826
Barry Warsaw51ac5802000-03-20 16:36:48 +00001827/* Count the number of occurrences of substr in str[start:end].

   NOTE(review): the C API documentation states that -1 is returned if an
   error occurred -- confirm against the implementation. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001828
Martin v. Löwis18e16552006-02-15 17:27:45 +00001829PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001830 PyObject *str, /* String */
1831 PyObject *substr, /* Substring to count */
1832 Py_ssize_t start, /* Start index */
1833 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001834 );
1835
Barry Warsaw51ac5802000-03-20 16:36:48 +00001836/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001837 and return the resulting Unicode object. */
1838
Mark Hammond91a681d2002-08-12 07:21:58 +00001839PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001840 PyObject *str, /* String */
1841 PyObject *substr, /* Substring to find */
1842 PyObject *replstr, /* Replacement string */
1843 Py_ssize_t maxcount /* Max. number of replacements to apply;
1844 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001845 );
1846
1847/* Compare two strings and return -1, 0, 1 for less than, equal,
1848 greater than, respectively.

   NOTE(review): -1 is also a valid "less than" result; the C API
   documentation says -1 is returned on failure too, so callers should
   check PyErr_Occurred() -- confirm against the implementation. */
1849
Mark Hammond91a681d2002-08-12 07:21:58 +00001850PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001851 PyObject *left, /* Left string */
1852 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001853 );
1854
/* Compare a Unicode object with a C string supplied as an ASCII-encoded
   char buffer.

   NOTE(review): presumably returns -1, 0, 1 like PyUnicode_Compare --
   confirm against the C API documentation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001855PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1856 PyObject *left, /* Left operand */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001857 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001858 );
1859
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001860/* Rich compare two strings and return one of the following:
1861
1862 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001863 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001864 - Py_NotImplemented in case the type combination is unknown
1865
1866 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1867 case the conversion of the arguments to Unicode fails with a
1868 UnicodeDecodeError.
1869
1870 Possible values for op:
1871
1872 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1873
1874*/
1875
1876PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001877 PyObject *left, /* Left string */
1878 PyObject *right, /* Right string */
1879 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001880 );
1881
Thomas Wouters7e474022000-07-16 12:04:32 +00001882/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001883 the resulting Unicode string. */
1884
Mark Hammond91a681d2002-08-12 07:21:58 +00001885PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001886 PyObject *format, /* Format string */
1887 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001888 );
1889
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001890/* Check whether element is contained in container and return 1/0
1891 accordingly.
1892
1893 element has to coerce to a one-element Unicode string. -1 is
1894 returned in case of an error. */
1895
Mark Hammond91a681d2002-08-12 07:21:58 +00001896PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001897 PyObject *container, /* Container string */
1898 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001899 );
1900
Martin v. Löwis47383402007-08-15 07:32:56 +00001901/* Check whether the string s is a valid identifier.

   NOTE(review): the result is an int; presumably 1 for a valid identifier
   and 0 otherwise -- confirm against the C API documentation. */
1902
1903PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1904
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001905#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001906/* Externally visible for str.strip(unicode).

   Declared only when Py_LIMITED_API is not defined.

   NOTE(review): striptype presumably selects left/right/both stripping and
   sepobj the set of characters to strip -- neither is visible here; confirm
   against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001907PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001908 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001909 int striptype,
1910 PyObject *sepobj
1911 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001912#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001913
Eric Smith5807c412008-05-11 21:00:57 +00001914/* Using the current locale, insert the thousands grouping
1915 into the string pointed to by buffer. For the argument descriptions,
1916 see Objects/stringlib/localeutil.h.

   This variant operates on Py_UNICODE buffers and is declared only when
   Py_LIMITED_API is not defined. */
1917
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001918#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001919PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1920 Py_ssize_t n_buffer,
1921 Py_UNICODE *digits,
1922 Py_ssize_t n_digits,
1923 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001924#endif
Eric Smith5807c412008-05-11 21:00:57 +00001925
Eric Smitha3b1ac82009-04-03 14:45:06 +00001926/* Using explicitly passed-in values (grouping and thousands_sep), insert
1927 the thousands grouping into the string pointed to by buffer. For the
1928 argument descriptions, see Objects/stringlib/localeutil.h.

   buffer and digits are untyped (void *) and come with an int kind
   argument. NOTE(review): kind presumably gives the character width of
   those buffers -- confirm against the implementation.

   Declared only when Py_LIMITED_API is not defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001929#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001931 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 int kind,
1933 void *buffer,
1934 Py_ssize_t n_buffer,
1935 void *digits,
1936 Py_ssize_t n_digits,
1937 Py_ssize_t min_width,
1938 const char *grouping,
1939 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001940#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001941/* === Characters Type APIs =============================================== */
1942
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001943/* Helper array used by Py_UNICODE_ISSPACE().

   The Py_LIMITED_API conditional opened just below is not closed right
   after this declaration: it also covers the character-type helpers that
   follow and ends at the "#endif" marked Py_LIMITED_API after
   PyUnicode_AsUnicodeCopy. */
1944
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001945#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001946PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1947
Guido van Rossumd8225182000-03-10 22:33:05 +00001948/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001949 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001950
1951 These APIs are implemented in Objects/unicodectype.c.
1952
   The _PyUnicode_Is* predicates that follow each take one Py_UCS4 code
   point and return an int. NOTE(review): presumably non-zero means the
   character has the named property -- confirm in Objects/unicodectype.c.

1953*/
1954
Mark Hammond91a681d2002-08-12 07:21:58 +00001955PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001956 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001957 );
1958
Mark Hammond91a681d2002-08-12 07:21:58 +00001959PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001960 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001961 );
1962
Mark Hammond91a681d2002-08-12 07:21:58 +00001963PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001964 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001965 );
1966
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001967PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001968 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001969 );
1970
1971PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001972 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001973 );
1974
Mark Hammond91a681d2002-08-12 07:21:58 +00001975PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001976 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001977 );
1978
Mark Hammond91a681d2002-08-12 07:21:58 +00001979PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001980 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001981 );
1982
/* Case-mapping helpers: each takes one Py_UCS4 code point and returns a
   Py_UCS4.
   NOTE(review): presumably the result is the single-character lower-,
   upper- or title-case mapping of ch -- confirm in
   Objects/unicodectype.c. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001983PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1984 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001985 );
1986
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001987PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1988 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001989 );
1990
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001991PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1992 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001993 );
1994
/* Numeric-value helpers: the decimal-digit and digit conversions return an
   int, the numeric conversion returns a double.
   NOTE(review): the C API documentation for the Py_UNICODE_TODECIMAL,
   Py_UNICODE_TODIGIT and Py_UNICODE_TONUMERIC macros describes a negative
   result when the character has no such value -- confirm that these
   functions behave the same way. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001995PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001996 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001997 );
1998
Mark Hammond91a681d2002-08-12 07:21:58 +00001999PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002000 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002001 );
2002
Mark Hammond91a681d2002-08-12 07:21:58 +00002003PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002004 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002005 );
2006
/* Further character-class predicates: each takes one Py_UCS4 code point
   and returns an int.
   NOTE(review): presumably non-zero means the character belongs to the
   named class -- confirm in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00002007PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002008 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002009 );
2010
Mark Hammond91a681d2002-08-12 07:21:58 +00002011PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002012 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002013 );
2014
Mark Hammond91a681d2002-08-12 07:21:58 +00002015PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002016 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002017 );
2018
Georg Brandl559e5d72008-06-11 18:37:52 +00002019PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002020 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002021 );
2022
Mark Hammond91a681d2002-08-12 07:21:58 +00002023PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002024 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002025 );
2026
/* Length and copy helpers operating on Py_UNICODE buffers.
   NOTE(review): the names suggest these mirror the C library's strlen,
   strcpy, strcat and strncpy for Py_UNICODE strings (including the
   caller's responsibility for sizing the destination s1) -- confirm
   against the implementation. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00002027PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2028 const Py_UNICODE *u
2029 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002030
2031PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002032 Py_UNICODE *s1,
2033 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002034
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002035PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2036 Py_UNICODE *s1, const Py_UNICODE *s2);
2037
Martin v. Löwis5b222132007-06-10 09:51:05 +00002038PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002039 Py_UNICODE *s1,
2040 const Py_UNICODE *s2,
2041 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002042
/* Comparison and search helpers operating on Py_UNICODE buffers.
   NOTE(review): the names suggest these mirror the C library's strcmp,
   strncmp, strchr and strrchr -- confirm against the implementation.
   The two search functions accept a const pointer but return a non-const
   Py_UNICODE pointer. */
2043PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002044 const Py_UNICODE *s1,
2045 const Py_UNICODE *s2
2046 );
2047
2048PyAPI_FUNC(int) Py_UNICODE_strncmp(
2049 const Py_UNICODE *s1,
2050 const Py_UNICODE *s2,
2051 size_t n
2052 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002053
2054PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002055 const Py_UNICODE *s,
2056 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002057 );
2058
Victor Stinner331ea922010-08-10 16:37:20 +00002059PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002060 const Py_UNICODE *s,
2061 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002062 );
2063
Victor Stinner71133ff2010-09-01 23:43:53 +00002064/* Create a copy of a Unicode string; the copy ends with a nul character.
2065 Return NULL and raise a MemoryError exception on memory allocation
2066 failure; otherwise return a newly allocated buffer, which the caller
   must release with PyMem_Free(). */
2067
Victor Stinner46408602010-09-03 16:18:00 +00002068PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002069 PyObject *unicode
2070 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002071#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002072
/* Declared only in debug builds (Py_DEBUG defined) and only when
   Py_LIMITED_API is not defined.
   NOTE(review): presumably validates the internal state of the Unicode
   object op, with check_content enabling a check of the stored
   characters -- confirm against the implementation. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002073#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002074PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002075 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002076 int check_content);
2077#endif
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079/********************* String Literals ****************************************/
2080/* This structure helps manage static strings. The basic usage goes like this:
2081 Instead of doing
2082
2083 r = PyObject_CallMethod(o, "foo", "args", ...);
2084
2085 do
2086
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002087 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002088 ...
2089 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2090
2091 PyId_foo is a static variable, either at block level or at file level. On first
2092 usage, the string "foo" is interned, and the structures are linked. On interpreter
2093 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2094
2095 Alternatively, _Py_static_string allows choosing the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002096 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002097 _PyObject_{Get,Set,Has}AttrId are the attribute get/set/has counterparts
   that take a _Py_Identifier pointer.
2098*/
2099typedef struct _Py_Identifier {
2100 struct _Py_Identifier *next; /* Link to another identifier (see "the structures are linked" above) */
2101 const char* string; /* C string supplied when the identifier is defined */
2102 PyObject *object; /* NOTE(review): presumably the interned string object, unset until first use -- confirm */
2103} _Py_Identifier;
2104
/* Define a static _Py_Identifier named varname whose string member is value;
   the next and object members are initialized to 0. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002105#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
/* Define a static _Py_Identifier named PyId_<varname> whose string member is
   the stringified varname, e.g. _Py_IDENTIFIER(foo) defines PyId_foo for "foo". */
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002106#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107
2108/* Return an interned Unicode object for an Identifier; may fail if there is
   no memory. The String Literals comment in this header describes the
   returned reference as borrowed.
   NOTE(review): failure is presumably signalled by a NULL return --
   confirm against the implementation. */
2109PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2110/* Clear all static strings. The String Literals comment in this header
   says the strings are released through this function on interpreter
   shutdown. */
2111PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2112
Guido van Rossumd8225182000-03-10 22:33:05 +00002113#ifdef __cplusplus
2114}
2115#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002116#endif /* !Py_UNICODEOBJECT_H */