blob: c5480f1b46fe1c439092b457e8772184123d7e87 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000145#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
152
153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
156
157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000161
162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
165
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000167
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168#define Py_UNICODE_ISALNUM(ch) \
169 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_UNICODE_ISDECIMAL(ch) || \
171 Py_UNICODE_ISDIGIT(ch) || \
172 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200174#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store `value` into each of the first `length` elements of the Py_UNICODE
   buffer `target`.

   Every argument is evaluated exactly once, in the order target, value,
   length; the length is captured in a local before the loop starts so an
   argument with side effects (or an expensive call) is not re-run on every
   iteration.  A length of zero or less stores nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t i_, n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* macros to work with surrogates

   Each predicate yields non-zero when `ch` lies in the stated inclusive
   range.  `ch` is parenthesized at every use so that an expression argument
   (for example `c & 0xFFFF`) is compared as a whole; note that it is still
   expanded twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
186/* Join two surrogate characters and return a single Py_UCS4 value. */
187#define Py_UNICODE_JOIN_SURROGATES(high, low) \
188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \
189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
Victor Stinner551ac952011-11-29 22:58:13 +0100190/* high surrogate = top 10 bits added to D800 */
191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10))
192/* low surrogate = bottom 10 bits added to DC00 */
193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300194
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000195/* Check if substring matches at given offset. The offset must be
196 valid, and the substring must not be empty.

   The expansion compares the first code unit, then the last code unit, and
   only then falls back to memcmp() over the whole substring.  `string`,
   `offset` and `substring` are each expanded several times, so they should
   not carry side effects.

   NOTE(review): the expansion reads both ->wstr and ->wstr_length directly
   from its arguments, but in the struct definitions in this header wstr is a
   direct member of PyASCIIObject only and wstr_length a direct member of
   PyCompactUnicodeObject only -- confirm that this still compiles for the
   pointer types callers actually pass. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200199 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
200 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
201 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000203#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000211#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214 structure. state.ascii and state.compact are set, and the data
215 immediately follow the structure. utf8_length and wstr_length can be found
216 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000217typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200218 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200219
220 - compact ascii:
221
222 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100223 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200224 * kind = PyUnicode_1BYTE_KIND
225 * compact = 1
226 * ascii = 1
227 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200228 * (length is the length of the utf8 and wstr strings)
229 * (data starts just after the structure)
230 * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200231
232 - compact:
233
234 * structure = PyCompactUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100235 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237 PyUnicode_4BYTE_KIND
238 * compact = 1
239 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200240 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200242 * utf8_length = 0 if utf8 is NULL
243 * wstr is shared with data and wstr_length=length
244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200246 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200247 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200248
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200249 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200250
251 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100252 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100253 * length = 0 (use wstr_length)
254 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200255 * kind = PyUnicode_WCHAR_KIND
256 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200257 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200258 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100259 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200260 * wstr is not NULL
261 * data.any is NULL
262 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200263 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200266
267 * structure = PyUnicodeObject structure
Victor Stinner7a9105a2011-12-12 00:13:42 +0100268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270 PyUnicode_4BYTE_KIND
271 * compact = 0
272 * ready = 1
273 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200274 * utf8 is shared and utf8_length = length with data.any if ascii = 1
275 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100276 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
278 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
279 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200280
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200281 Compact strings use only one memory block (structure + characters),
282 whereas legacy strings use one block for the structure and one block
283 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200284
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200285 Legacy strings are created by PyUnicode_FromUnicode() and
286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287 when PyUnicode_READY() is called.
288
289 See also _PyUnicode_CheckConsistency().
290 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000291 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000293 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 struct {
295 /*
296 SSTATE_NOT_INTERNED (0)
297 SSTATE_INTERNED_MORTAL (1)
298 SSTATE_INTERNED_IMMORTAL (2)
299
300 If interned != SSTATE_NOT_INTERNED, the two references from the
301 dictionary to this object are *not* counted in ob_refcnt.
302 */
303 unsigned int interned:2;
304 /* Character size:
305
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200306 - PyUnicode_WCHAR_KIND (0):
307
308 * character type = wchar_t (16 or 32 bits, depending on the
309 platform)
310
311 - PyUnicode_1BYTE_KIND (1):
312
313 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100314 * all characters are in the range U+0000-U+00FF (latin1)
315 * if ascii is set, all characters are in the range U+0000-U+007F
316 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200317 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200318
319 - PyUnicode_2BYTE_KIND (2):
320
321 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100322 * all characters are in the range U+0000-U+FFFF (BMP)
323 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200324
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200325 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200326
327 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100328 * all characters are in the range U+0000-U+10FFFF
329 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200331 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200332 /* Compact is with respect to the allocation scheme. Compact unicode
333 objects only require one memory block while non-compact objects use
334 one block for the PyUnicodeObject struct and another for its data
335 buffer. */
336 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100337 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200340 unsigned int ascii:1;
341 /* The ready flag indicates whether the object layout is initialized
342 completely. This means that this is either a compact object, or
343 the data pointer is filled out. The bit is redundant, and helps
344 to minimize the test in PyUnicode_IS_READY(). */
345 unsigned int ready:1;
346 } state;
347 wchar_t *wstr; /* wchar_t representation (null-terminated) */
348} PyASCIIObject;
349
350/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200351 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200352 immediately follow the structure. */
353typedef struct {
354 PyASCIIObject _base;
355 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
356 * terminating \0. */
357 char *utf8; /* UTF-8 representation (null-terminated) */
358 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
359 * surrogates count as two code points. */
360} PyCompactUnicodeObject;
361
362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
363 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200364 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365typedef struct {
366 PyCompactUnicodeObject _base;
367 union {
368 void *any;
369 Py_UCS1 *latin1;
370 Py_UCS2 *ucs2;
371 Py_UCS4 *ucs4;
372 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000373} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000374#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000375
Mark Hammond91a681d2002-08-12 07:21:58 +0000376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000378
Thomas Wouters27d517b2007-02-25 20:39:11 +0000379#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000380 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000382
383/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000384#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200385
/* Length of the wstr representation of op.

   For compact ASCII strings this is the `length` field of PyASCIIObject
   (that structure has no separate wstr_length); for every other form it is
   PyCompactUnicodeObject.wstr_length.  No type checks are performed.

   op is parenthesized before each cast so that an expression argument
   (pointer arithmetic, a conditional, ...) is converted as a whole rather
   than having the cast bind to its first operand only. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
390
391/* Returns the deprecated Py_UNICODE representation's size in code units
392 (this includes surrogate pairs as 2 units).
393 If the Py_UNICODE representation is not available, it will be computed
394 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
395
Victor Stinnerf3ae6202011-11-21 02:24:49 +0100396#define PyUnicode_GET_SIZE(op) \
397 (assert(PyUnicode_Check(op)), \
398 (((PyASCIIObject *)(op))->wstr) ? \
399 PyUnicode_WSTR_LENGTH(op) : \
400 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
401 assert(((PyASCIIObject *)(op))->wstr), \
402 PyUnicode_WSTR_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200403
Guido van Rossumd8225182000-03-10 22:33:05 +0000404#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
406
407/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
408 representation on demand. Using this macro is very inefficient now,
409 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
410 use PyUnicode_WRITE() and PyUnicode_READ(). */
411
Guido van Rossumd8225182000-03-10 22:33:05 +0000412#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413 (assert(PyUnicode_Check(op)), \
414 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
415 PyUnicode_AsUnicode((PyObject *)(op)))
416
Guido van Rossumd8225182000-03-10 22:33:05 +0000417#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418 ((const char *)(PyUnicode_AS_UNICODE(op)))
419
420
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200422
Victor Stinner6f9568b2011-11-17 00:12:44 +0100423/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200424
425/* Interning state. */
426#define SSTATE_NOT_INTERNED 0
427#define SSTATE_INTERNED_MORTAL 1
428#define SSTATE_INTERNED_IMMORTAL 2
429
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.

   In debug builds the two assert()s check the type and the ready flag; the
   value of the expression is the state.ascii bit.  op is parenthesized
   before the cast so that an expression argument is converted as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
Victor Stinnera3b334d2011-10-03 13:53:37 +0200437
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not. No type checks or Ready calls are performed.

   Both state bits must be set: state.ascii is tested first, then
   state.compact.  op is parenthesized before the cast, as in
   PyUnicode_IS_COMPACT, so that an expression argument is converted as a
   whole. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200448enum PyUnicode_Kind {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200449/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200450 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200451 has not been called yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200452 PyUnicode_WCHAR_KIND = 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200453/* Return values of the PyUnicode_KIND() macro: */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200454 PyUnicode_1BYTE_KIND = 1,
455 PyUnicode_2BYTE_KIND = 2,
456 PyUnicode_4BYTE_KIND = 4
457};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200458
Georg Brandl4975a9b2011-10-05 16:12:21 +0200459/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200461 No checks are performed, use PyUnicode_KIND() before to ensure
462 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
465#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
466#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
467
Victor Stinner157f83f2011-09-28 21:41:31 +0200468/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200469#define PyUnicode_KIND(op) \
470 (assert(PyUnicode_Check(op)), \
471 assert(PyUnicode_IS_READY(op)), \
472 ((PyASCIIObject *)(op))->state.kind)
473
Victor Stinner157f83f2011-09-28 21:41:31 +0200474/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200476 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200477 ((void*)((PyASCIIObject*)(op) + 1)) : \
478 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
479
480#define _PyUnicode_NONCOMPACT_DATA(op) \
481 (assert(((PyUnicodeObject*)(op))->data.any), \
482 ((((PyUnicodeObject *)(op))->data.any)))
483
484#define PyUnicode_DATA(op) \
485 (assert(PyUnicode_Check(op)), \
486 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
487 _PyUnicode_NONCOMPACT_DATA(op))
488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200489/* In the access macros below, "kind" may be evaluated more than once.
490 All other macro parameters are evaluated exactly once, so it is safe
491 to put side effects into them (such as increasing the index). */
492
493/* Write into the canonical representation, this macro does not do any sanity
494 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200495 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200496 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200497 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200498#define PyUnicode_WRITE(kind, data, index, value) \
499 do { \
500 switch ((kind)) { \
501 case PyUnicode_1BYTE_KIND: { \
502 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
503 break; \
504 } \
505 case PyUnicode_2BYTE_KIND: { \
506 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
507 break; \
508 } \
509 default: { \
510 assert((kind) == PyUnicode_4BYTE_KIND); \
511 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
512 } \
513 } \
514 } while (0)
515
Georg Brandl07de3252011-10-05 16:47:38 +0200516/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200517 or ready calls are performed. */
518#define PyUnicode_READ(kind, data, index) \
519 ((Py_UCS4) \
520 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200521 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522 ((kind) == PyUnicode_2BYTE_KIND ? \
523 ((const Py_UCS2 *)(data))[(index)] : \
524 ((const Py_UCS4 *)(data))[(index)] \
525 ) \
526 ))
527
528/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
529 calls PyUnicode_KIND() and might call it twice. For single reads, use
530 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
531 cache kind and use PyUnicode_READ instead. */
532#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200533 (assert(PyUnicode_Check(unicode)), \
534 assert(PyUnicode_IS_READY(unicode)), \
535 (Py_UCS4) \
536 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
537 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
538 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
539 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
540 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
541 ) \
542 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543
544/* Returns the length of the unicode string. The caller has to make sure that
545 the string has its canonical representation set before calling
546 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
547#define PyUnicode_GET_LENGTH(op) \
548 (assert(PyUnicode_Check(op)), \
549 assert(PyUnicode_IS_READY(op)), \
550 ((PyASCIIObject *)(op))->length)
551
552
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   Only the state.ready bit is read; no type checks are performed.  op is
   parenthesized before the cast so that an expression argument is converted
   as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
557
Victor Stinnera3b334d2011-10-03 13:53:37 +0200558/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200559 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200560 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 Returns 0 on success and -1 on errors. */
562#define PyUnicode_READY(op) \
563 (assert(PyUnicode_Check(op)), \
564 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200565 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566
/* Return a maximum character value which is suitable for creating another
   string based on op.  The value is chosen from the kind of op instead of
   iterating over the string, so it is always an approximation:

       ASCII string      -> 0x7f
       1-byte kind       -> 0xff
       2-byte kind       -> 0xffff
       anything else     -> 0x10ffff

   op must be ready (checked when assertions are enabled) and is evaluated
   more than once. */
#define PyUnicode_MAX_CHAR_VALUE(op)                            \
    (assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_ASCII(op)                     ? (0x7f)    :  \
      PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU)   :  \
      PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU) :  \
                                                   (0x10ffffU)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200579
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000580#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000581
582/* --- Constants ---------------------------------------------------------- */
583
584/* This Unicode character will be used as replacement character during
585 decoding if the errors argument is set to "replace". Note: the
586 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
587 Unicode 3.0. */
588
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200589#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000590
591/* === Public API ========================================================= */
592
593/* --- Plain Py_UNICODE --------------------------------------------------- */
594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200595/* With PEP 393, this is the recommended way to allocate a new unicode object.
596 This function will allocate the object and its buffer in a single memory
597 block. Objects created using this function are not resizable. */
598#ifndef Py_LIMITED_API
599PyAPI_FUNC(PyObject*) PyUnicode_New(
600 Py_ssize_t size, /* Number of code points in the new string */
601 Py_UCS4 maxchar /* maximum code point value in the string */
602 );
603#endif
604
Victor Stinnerd8f65102011-09-29 19:43:17 +0200605/* Initializes the canonical string representation from the deprecated
606 wstr/Py_UNICODE representation. This function is used to convert Unicode
607 objects which were created using the old API to the new flexible format
608 introduced with PEP 393.
609
610 Don't call this function directly, use the public PyUnicode_READY() macro
611 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612#ifndef Py_LIMITED_API
613PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200614 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615 );
616#endif
617
Victor Stinner034f6cf2011-09-30 02:26:44 +0200618/* Get a copy of a Unicode string. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100619#ifndef Py_LIMITED_API
620PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
Victor Stinner034f6cf2011-09-30 02:26:44 +0200621 PyObject *unicode
622 );
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100623#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200626 character conversion when necessary and falls back to memcpy if possible.
627
Victor Stinnera0702ab2011-09-29 14:14:38 +0200628 Fail if to is too small (smaller than how_many or smaller than
629 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
630 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200631
632 Return the number of characters written, or return -1 and raise an exception
633 on error.
634
635 Pseudo-code:
636
637 how_many = min(how_many, len(from) - from_start)
638 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
639 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200640
641 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200642 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 PyObject *to,
646 Py_ssize_t to_start,
647 PyObject *from,
648 Py_ssize_t from_start,
649 Py_ssize_t how_many
650 );
651#endif
652
Guido van Rossumd8225182000-03-10 22:33:05 +0000653/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000655
656 u may be NULL which causes the contents to be undefined. It is the
657 user's responsibility to fill in the needed data afterwards. Note
658 that modifying the Unicode object contents after construction is
659 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000660
661 The buffer is copied into the new object. */
662
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000663#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000664PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000665 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000666 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000667 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000668#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000669
Georg Brandl952867a2010-06-27 10:17:12 +0000670/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000672 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000673 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000674 );
675
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000676/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000678PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000679 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000680 );
681
Victor Stinnerb9275c12011-10-05 14:01:42 +0200682/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
683 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684#ifndef Py_LIMITED_API
685PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
686 int kind,
687 const void *buffer,
688 Py_ssize_t size);
689#endif
690
691PyAPI_FUNC(PyObject*) PyUnicode_Substring(
692 PyObject *str,
693 Py_ssize_t start,
694 Py_ssize_t end);
695
Georg Brandldb6c7f52011-10-07 11:19:11 +0200696/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 is set. Return NULL and raise an exception on error. Raise a ValueError if
698 the buffer is smaller than the string. Return buffer on success.
699
700 buflen is the length of the buffer in (Py_UCS4) characters. */
701PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
702 PyObject *unicode,
703 Py_UCS4* buffer,
704 Py_ssize_t buflen,
705 int copy_null);
706
707/* Copy the string into a UCS4 buffer. A new buffer is allocated using
708 * PyMem_Malloc; if this fails, NULL is returned with a memory error
709 exception set. */
710PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
711
Guido van Rossumd8225182000-03-10 22:33:05 +0000712/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713 Py_UNICODE buffer.
714 If the wchar_t/Py_UNICODE representation is not yet available, this
715 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000716
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000717#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000718PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000720 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000721#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723/* Return a read-only pointer to the Unicode object's internal
724 Py_UNICODE buffer and save the length at size.
725 If the wchar_t/Py_UNICODE representation is not yet available, this
726 function will calculate it. */
727
728#ifndef Py_LIMITED_API
729PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
730 PyObject *unicode, /* Unicode object */
731 Py_ssize_t *size /* location where to save the length */
732 );
733#endif
734
Guido van Rossumd8225182000-03-10 22:33:05 +0000735/* Get the length of the Unicode object. */
736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200737PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
738 PyObject *unicode
739);
740
Victor Stinner157f83f2011-09-28 21:41:31 +0200741/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742 string representation. */
743
Martin v. Löwis18e16552006-02-15 17:27:45 +0000744PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000746 );
747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748/* Read a character from the string. */
749
750PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
751 PyObject *unicode,
752 Py_ssize_t index
753 );
754
755/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200756 PyUnicode_New, must not be shared, and must not have been hashed yet.
757
758 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200759
760PyAPI_FUNC(int) PyUnicode_WriteChar(
761 PyObject *unicode,
762 Py_ssize_t index,
763 Py_UCS4 character
764 );
765
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000766#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000767/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000768PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000769#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000770
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100771/* Resize a Unicode object. The length is the number of characters, except
772 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
773 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000774
775 *unicode is modified to point to the new (resized) object and 0
776 returned on success.
777
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100778 Try to resize the string in place (which is usually faster than allocating
779 a new string and copying characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000780
781 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100782 is returned and *unicode left untouched.
783
784 WARNING: The function doesn't check string content, the result may not be a
785 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000786
Mark Hammond91a681d2002-08-12 07:21:58 +0000787PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 PyObject **unicode, /* Pointer to the Unicode object */
789 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000790 );
791
Guido van Rossumd8225182000-03-10 22:33:05 +0000792/* Coerce obj to a Unicode object and return a reference with
793 *incremented* refcount.
794
795 Coercion is done in the following way:
796
Georg Brandl952867a2010-06-27 10:17:12 +0000797 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000798 under the assumptions that they contain data using the UTF-8
799 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000800
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000801 2. All other objects (including Unicode objects) raise an
802 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000803
804 The API returns NULL in case of an error. The caller is responsible
805 for decref'ing the returned objects.
806
807*/
808
Mark Hammond91a681d2002-08-12 07:21:58 +0000809PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000811 const char *encoding, /* encoding */
812 const char *errors /* error handling */
813 );
814
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000815/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000816 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000818 Unicode objects are passed back as-is (subclasses are converted to
819 true Unicode objects), all other objects are delegated to
820 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000821 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000822
823 The API returns NULL in case of an error. The caller is responsible
824 for decref'ing the returned objects.
825
826*/
827
Mark Hammond91a681d2002-08-12 07:21:58 +0000828PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000830 );
831
Victor Stinner1205f272010-09-11 00:54:47 +0000832PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
833 const char *format, /* ASCII-encoded string */
834 va_list vargs
835 );
836PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
837 const char *format, /* ASCII-encoded string */
838 ...
839 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000840
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000841#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000842/* Format the object based on the format_spec, as defined in PEP 3101
843 (Advanced String Formatting). */
844PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845 PyObject *format_spec,
846 Py_ssize_t start,
847 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000848#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000849
Walter Dörwald16807132007-05-25 13:52:07 +0000850PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
851PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000852PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
853 const char *u /* UTF-8 encoded string */
854 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000855#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000856PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000857#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000858
/* Read the `interned` field of the state of op, viewing op as a
   PyASCIIObject.  No type check is performed, so use this only if you know
   op is a string. */
#define PyUnicode_CHECK_INTERNED(op) (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000862
Guido van Rossumd8225182000-03-10 22:33:05 +0000863/* --- wchar_t support for platforms which support it --------------------- */
864
865#ifdef HAVE_WCHAR_H
866
Georg Brandl952867a2010-06-27 10:17:12 +0000867/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000868 size.
869
870 The buffer is copied into the new object. */
871
Mark Hammond91a681d2002-08-12 07:21:58 +0000872PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000873 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000874 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000875 );
876
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000877/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000878 most size wchar_t characters are copied.
879
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000880 Note that the resulting wchar_t string may or may not be
881 0-terminated. It is the responsibility of the caller to make sure
882 that the wchar_t string is 0-terminated in case this is required by
883 the application.
884
885 Returns the number of wchar_t characters copied (excluding a
886 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000887 error. */
888
Martin v. Löwis18e16552006-02-15 17:27:45 +0000889PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000890 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000891 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000892 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000893 );
894
Victor Stinner137c34c2010-09-29 10:25:54 +0000895/* Convert the Unicode object to a wide character string. The output string
896 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200897 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000898
899 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
900 on success. On error, raises a MemoryError and returns NULL; *size is
901 undefined. */
902
903PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000904 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000905 Py_ssize_t *size /* number of characters of the result */
906 );
907
Victor Stinner9f789e72011-10-01 03:57:28 +0200908#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200910#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911
Guido van Rossumd8225182000-03-10 22:33:05 +0000912#endif
913
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000914/* --- Unicode ordinals --------------------------------------------------- */
915
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000916/* Create a Unicode Object from the given Unicode code point ordinal.
917
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918 The ordinal must be in range(0x10000) on narrow Python builds
919 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
920 raised in case it is not.
921
922*/
923
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000924PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000925
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000926/* --- Free-list management ----------------------------------------------- */
927
928/* Clear the free list used by the Unicode implementation.
929
930 This can be used to release memory used for objects on the free
931 list back to the Python memory allocator.
932
933*/
934
935PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
936
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000937/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000938
939 Many of these APIs take two arguments encoding and errors. These
940 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000941 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000942
Georg Brandl952867a2010-06-27 10:17:12 +0000943 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000944
945 Error handling is set by errors which may also be set to NULL
946 meaning to use the default handling defined for the codec. Default
947 error handling for all builtin codecs is "strict" (ValueErrors are
948 raised).
949
950 The codecs all use a similar interface. Only deviations from the
951 generic ones are documented.
952
953*/
954
Fred Drakecb093fe2000-05-09 19:51:53 +0000955/* --- Manage the default encoding ---------------------------------------- */
956
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000957/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000958 Unicode object unicode and the size of the encoded representation
959 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000960
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000961 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000962
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200963 This function caches the UTF-8 encoded string in the unicodeobject
964 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 when the unicodeobject is deallocated.
966
967 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
968 support the previous internal function with the same behaviour.
969
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000970 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000971 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000972
973 *** If you need to access the Unicode object as UTF-8 bytes string,
974 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000975*/
976
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000977#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000979 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000980 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000982#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000983
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000984/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000985 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
988 in the unicodeobject.
989
990 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
991 support the previous internal function with the same behaviour.
992
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000993 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000994 extracted from the returned data.
995
996 *** This API is for interpreter INTERNAL USE ONLY and will likely
997 *** be removed or changed for Python 3.1.
998
999 *** If you need to access the Unicode object as UTF-8 bytes string,
1000 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001001
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001002*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001003
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001004#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1006#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001007#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001008
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001009/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001010
Mark Hammond91a681d2002-08-12 07:21:58 +00001011PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001012
Guido van Rossumd8225182000-03-10 22:33:05 +00001013/* --- Generic Codecs ----------------------------------------------------- */
1014
1015/* Create a Unicode object by decoding the encoded string s of the
1016 given size. */
1017
Mark Hammond91a681d2002-08-12 07:21:58 +00001018PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001019 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001020 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001021 const char *encoding, /* encoding */
1022 const char *errors /* error handling */
1023 );
1024
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001025/* Decode a Unicode object unicode and return the result as Python
1026 object. */
1027
1028PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 PyObject *unicode, /* Unicode object */
1030 const char *encoding, /* encoding */
1031 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001032 );
1033
1034/* Decode a Unicode object unicode and return the result as Unicode
1035 object. */
1036
1037PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001038 PyObject *unicode, /* Unicode object */
1039 const char *encoding, /* encoding */
1040 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001041 );
1042
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001044 Python string object. */
1045
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001046#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001047PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001048 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001049 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001050 const char *encoding, /* encoding */
1051 const char *errors /* error handling */
1052 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001053#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001054
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001055/* Encodes a Unicode object and returns the result as Python
1056 object. */
1057
1058PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 PyObject *unicode, /* Unicode object */
1060 const char *encoding, /* encoding */
1061 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001062 );
1063
Guido van Rossumd8225182000-03-10 22:33:05 +00001064/* Encodes a Unicode object and returns the result as Python string
1065 object. */
1066
Mark Hammond91a681d2002-08-12 07:21:58 +00001067PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001068 PyObject *unicode, /* Unicode object */
1069 const char *encoding, /* encoding */
1070 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001071 );
1072
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001073/* Encodes a Unicode object and returns the result as Unicode
1074 object. */
1075
1076PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077 PyObject *unicode, /* Unicode object */
1078 const char *encoding, /* encoding */
1079 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001080 );
1081
1082/* Build an encoding map. */
1083
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001084PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1085 PyObject* string /* 256 character map */
1086 );
1087
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001088/* --- UTF-7 Codecs ------------------------------------------------------- */
1089
Mark Hammond91a681d2002-08-12 07:21:58 +00001090PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 const char *string, /* UTF-7 encoded string */
1092 Py_ssize_t length, /* size of string */
1093 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 );
1095
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001096PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 const char *string, /* UTF-7 encoded string */
1098 Py_ssize_t length, /* size of string */
1099 const char *errors, /* error handling */
1100 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001101 );
1102
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001103#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001104PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 const Py_UNICODE *data, /* Unicode char buffer */
1106 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1107 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1108 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1109 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001111PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1112 PyObject *unicode, /* Unicode object */
1113 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1114 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1115 const char *errors /* error handling */
1116 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001117#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001118
Guido van Rossumd8225182000-03-10 22:33:05 +00001119/* --- UTF-8 Codecs ------------------------------------------------------- */
1120
Mark Hammond91a681d2002-08-12 07:21:58 +00001121PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 const char *string, /* UTF-8 encoded string */
1123 Py_ssize_t length, /* size of string */
1124 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001125 );
1126
Walter Dörwald69652032004-09-07 20:24:22 +00001127PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 const char *string, /* UTF-8 encoded string */
1129 Py_ssize_t length, /* size of string */
1130 const char *errors, /* error handling */
1131 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001132 );
1133
Mark Hammond91a681d2002-08-12 07:21:58 +00001134PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001135 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001136 );
1137
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001138#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1140 PyObject *unicode,
1141 const char *errors);
1142
Mark Hammond91a681d2002-08-12 07:21:58 +00001143PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 const Py_UNICODE *data, /* Unicode char buffer */
1145 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1146 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001147 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001148#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001149
Walter Dörwald41980ca2007-08-16 21:55:45 +00001150/* --- UTF-32 Codecs ------------------------------------------------------ */
1151
1152/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1153 the corresponding Unicode object.
1154
1155 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001156 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001157
1158 If byteorder is non-NULL, the decoder starts decoding using the
1159 given byte order:
1160
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001161 *byteorder == -1: little endian
1162 *byteorder == 0: native order
1163 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001164
1165 In native mode, the first four bytes of the stream are checked for a
1166 BOM mark. If found, the BOM mark is analysed, the byte order
1167 adjusted and the BOM skipped. In the other modes, no BOM mark
1168 interpretation is done. After completion, *byteorder is set to the
1169 current byte order at the end of input data.
1170
1171 If byteorder is NULL, the codec starts in native order mode.
1172
1173*/
1174
1175PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001176 const char *string, /* UTF-32 encoded string */
1177 Py_ssize_t length, /* size of string */
1178 const char *errors, /* error handling */
1179 int *byteorder /* pointer to byteorder to use
1180 0=native;-1=LE,1=BE; updated on
1181 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001182 );
1183
1184PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 const char *string, /* UTF-32 encoded string */
1186 Py_ssize_t length, /* size of string */
1187 const char *errors, /* error handling */
1188 int *byteorder, /* pointer to byteorder to use
1189 0=native;-1=LE,1=BE; updated on
1190 exit */
1191 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001192 );
1193
1194/* Returns a Python string using the UTF-32 encoding in native byte
1195 order. The string always starts with a BOM mark. */
1196
1197PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001198 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001199 );
1200
1201/* Returns a Python string object holding the UTF-32 encoded value of
1202 the Unicode data.
1203
1204 If byteorder is not 0, output is written according to the following
1205 byte order:
1206
1207 byteorder == -1: little endian
1208 byteorder == 0: native byte order (writes a BOM mark)
1209 byteorder == 1: big endian
1210
1211 If byteorder is 0, the output string will always start with the
1212 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1213 prepended.
1214
1215*/
1216
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001217#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001218PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 const Py_UNICODE *data, /* Unicode char buffer */
1220 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1221 const char *errors, /* error handling */
1222 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001223 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001224PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1225 PyObject *object, /* Unicode object */
1226 const char *errors, /* error handling */
1227 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1228 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001229#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001230
Guido van Rossumd8225182000-03-10 22:33:05 +00001231/* --- UTF-16 Codecs ------------------------------------------------------ */
1232
Guido van Rossum9e896b32000-04-05 20:11:21 +00001233/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001234 the corresponding Unicode object.
1235
1236 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001237 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001238
1239 If byteorder is non-NULL, the decoder starts decoding using the
1240 given byte order:
1241
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242 *byteorder == -1: little endian
1243 *byteorder == 0: native order
1244 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001245
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001246 In native mode, the first two bytes of the stream are checked for a
1247 BOM mark. If found, the BOM mark is analysed, the byte order
1248 adjusted and the BOM skipped. In the other modes, no BOM mark
1249 interpretation is done. After completion, *byteorder is set to the
1250 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001251
1252 If byteorder is NULL, the codec starts in native order mode.
1253
1254*/
1255
Mark Hammond91a681d2002-08-12 07:21:58 +00001256PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001257 const char *string, /* UTF-16 encoded string */
1258 Py_ssize_t length, /* size of string */
1259 const char *errors, /* error handling */
1260 int *byteorder /* pointer to byteorder to use
1261 0=native;-1=LE,1=BE; updated on
1262 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001263 );
1264
Walter Dörwald69652032004-09-07 20:24:22 +00001265PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 const char *string, /* UTF-16 encoded string */
1267 Py_ssize_t length, /* size of string */
1268 const char *errors, /* error handling */
1269 int *byteorder, /* pointer to byteorder to use
1270 0=native;-1=LE,1=BE; updated on
1271 exit */
1272 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001273 );
1274
Guido van Rossumd8225182000-03-10 22:33:05 +00001275/* Returns a Python string using the UTF-16 encoding in native byte
1276 order. The string always starts with a BOM mark. */
1277
Mark Hammond91a681d2002-08-12 07:21:58 +00001278PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001280 );
1281
1282/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001283 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001284
1285 If byteorder is not 0, output is written according to the following
1286 byte order:
1287
1288 byteorder == -1: little endian
1289 byteorder == 0: native byte order (writes a BOM mark)
1290 byteorder == 1: big endian
1291
1292 If byteorder is 0, the output string will always start with the
1293 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1294 prepended.
1295
1296 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1297 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001298 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001299
1300*/
1301
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001302#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001303PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 const Py_UNICODE *data, /* Unicode char buffer */
1305 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1306 const char *errors, /* error handling */
1307 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001308 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001309PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1310 PyObject* unicode, /* Unicode object */
1311 const char *errors, /* error handling */
1312 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1313 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001315
1316/* --- Unicode-Escape Codecs ---------------------------------------------- */
1317
Mark Hammond91a681d2002-08-12 07:21:58 +00001318PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 const char *string, /* Unicode-Escape encoded string */
1320 Py_ssize_t length, /* size of string */
1321 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001322 );
1323
Mark Hammond91a681d2002-08-12 07:21:58 +00001324PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001326 );
1327
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001328#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001329PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 const Py_UNICODE *data, /* Unicode char buffer */
1331 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001332 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001333#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001334
1335/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1336
Mark Hammond91a681d2002-08-12 07:21:58 +00001337PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 const char *string, /* Raw-Unicode-Escape encoded string */
1339 Py_ssize_t length, /* size of string */
1340 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001341 );
1342
Mark Hammond91a681d2002-08-12 07:21:58 +00001343PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001345 );
1346
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001347#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001348PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349 const Py_UNICODE *data, /* Unicode char buffer */
1350 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001351 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001352#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001353
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001354/* --- Unicode Internal Codec ---------------------------------------------
1355
1356 Only for internal use in _codecsmodule.c */
1357
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001358#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001359PyObject *_PyUnicode_DecodeUnicodeInternal(
1360 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001361 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001362 const char *errors
1363 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001364#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001367
1368 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1369
1370*/
1371
Mark Hammond91a681d2002-08-12 07:21:58 +00001372PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 const char *string, /* Latin-1 encoded string */
1374 Py_ssize_t length, /* size of string */
1375 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001376 );
1377
Mark Hammond91a681d2002-08-12 07:21:58 +00001378PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001380 );
1381
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001382#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1384 PyObject* unicode,
1385 const char* errors);
1386
Mark Hammond91a681d2002-08-12 07:21:58 +00001387PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 const Py_UNICODE *data, /* Unicode char buffer */
1389 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1390 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001391 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001392#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001395
1396 Only 7-bit ASCII data is accepted. All other codes generate errors.
1397
1398*/
1399
Mark Hammond91a681d2002-08-12 07:21:58 +00001400PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 const char *string, /* ASCII encoded string */
1402 Py_ssize_t length, /* size of string */
1403 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001404 );
1405
Mark Hammond91a681d2002-08-12 07:21:58 +00001406PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001408 );
1409
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001410#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1412 PyObject* unicode,
1413 const char* errors);
1414
Mark Hammond91a681d2002-08-12 07:21:58 +00001415PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 const Py_UNICODE *data, /* Unicode char buffer */
1417 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1418 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001419 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001420#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001423
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001424 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001425
1426 Decoding mappings must map single string characters to single
1427 Unicode characters, integers (which are then interpreted as Unicode
1428 ordinals) or None (meaning "undefined mapping" and causing an
1429 error).
1430
1431 Encoding mappings must map single Unicode characters to single
1432 string characters, integers (which are then interpreted as Latin-1
1433 ordinals) or None (meaning "undefined mapping" and causing an
1434 error).
1435
1436 If a character lookup fails with a LookupError, the character is
1437 copied as-is meaning that its ordinal value will be interpreted as
1438 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1439 to contain those mappings which map characters to different code
1440 points.
1441
1442*/
1443
Mark Hammond91a681d2002-08-12 07:21:58 +00001444PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001445 const char *string, /* Encoded string */
1446 Py_ssize_t length, /* size of string */
1447 PyObject *mapping, /* character mapping
1448 (char ordinal -> unicode ordinal) */
1449 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001450 );
1451
Mark Hammond91a681d2002-08-12 07:21:58 +00001452PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001453 PyObject *unicode, /* Unicode object */
1454 PyObject *mapping /* character mapping
1455 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001456 );
1457
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001458#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001459PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001460 const Py_UNICODE *data, /* Unicode char buffer */
1461 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1462 PyObject *mapping, /* character mapping
1463 (unicode ordinal -> char ordinal) */
1464 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001465 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001466PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1467 PyObject *unicode, /* Unicode object */
1468 PyObject *mapping, /* character mapping
1469 (unicode ordinal -> char ordinal) */
1470 const char *errors /* error handling */
1471 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001472#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001473
1474/* Translate a Py_UNICODE buffer of the given length by applying a
1475 character mapping table to it and return the resulting Unicode
1476 object.
1477
1478 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001479 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001480
1481 Mapping tables may be dictionaries or sequences. Unmapped character
1482 ordinals (ones which cause a LookupError) are left untouched and
1483 are copied as-is.
1484
1485*/
1486
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001487#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001488PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001489 const Py_UNICODE *data, /* Unicode char buffer */
1490 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1491 PyObject *table, /* Translate table */
1492 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001493 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001494#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001495
Victor Stinner99b95382011-07-04 14:23:54 +02001496#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001497
Guido van Rossumefec1152000-03-28 02:01:15 +00001498/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001499
Mark Hammond91a681d2002-08-12 07:21:58 +00001500PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001501 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001502 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001503 const char *errors /* error handling */
1504 );
1505
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001506PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1507 const char *string, /* MBCS encoded string */
1508 Py_ssize_t length, /* size of string */
1509 const char *errors, /* error handling */
1510 Py_ssize_t *consumed /* bytes consumed */
1511 );
1512
Victor Stinner3a50e702011-10-18 21:21:00 +02001513PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1514 int code_page, /* code page number */
1515 const char *string, /* encoded string */
1516 Py_ssize_t length, /* size of string */
1517 const char *errors, /* error handling */
1518 Py_ssize_t *consumed /* bytes consumed */
1519 );
1520
Mark Hammond91a681d2002-08-12 07:21:58 +00001521PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001522 PyObject *unicode /* Unicode object */
1523 );
1524
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001525#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001526PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001527 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001528 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001529 const char *errors /* error handling */
1530 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001531#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001532
Victor Stinner3a50e702011-10-18 21:21:00 +02001533PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1534 int code_page, /* code page number */
1535 PyObject *unicode, /* Unicode object */
1536 const char *errors /* error handling */
1537 );
1538
Victor Stinner99b95382011-07-04 14:23:54 +02001539#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001540
Guido van Rossum9e896b32000-04-05 20:11:21 +00001541/* --- Decimal Encoder ---------------------------------------------------- */
1542
1543/* Takes a Unicode string holding a decimal value and writes it into
1544 an output buffer using standard ASCII digit codes.
1545
1546 The output buffer has to provide at least length+1 bytes of storage
1547 area. The output string is 0-terminated.
1548
1549 The encoder converts whitespace to ' ', decimal characters to their
1550 corresponding ASCII digit and all other Latin-1 characters except
1551 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1552 are treated as errors. This includes embedded NULL bytes.
1553
1554 Error handling is defined by the errors argument:
1555
1556 NULL or "strict": raise a ValueError
1557 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001558 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001559 "replace": replaces illegal characters with '?'
1560
1561 Returns 0 on success, -1 on failure.
1562
1563*/
1564
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001565#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001566PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 Py_UNICODE *s, /* Unicode buffer */
1568 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1569 char *output, /* Output buffer; must have size >= length+1 (room for the terminating 0) */
1570 const char *errors /* error handling: NULL or "strict", "ignore", "replace" */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001571 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001572#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001573
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001574/* Transforms code points that have decimal digit property to the
1575 corresponding ASCII digit code points.
1576
1577 Returns a new Unicode string on success, NULL on failure.
1578*/
1579
Georg Brandlb5503082010-12-05 11:40:48 +00001580#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001581PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1582 Py_UNICODE *s, /* Unicode buffer */
1583 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1584 );
Georg Brandlb5503082010-12-05 11:40:48 +00001585#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001586
Victor Stinner6f9568b2011-11-17 00:12:44 +01001587/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 as argument instead of a raw buffer and length. This function additionally
1589 transforms spaces to ASCII because this is what the callers in longobject,
1590 floatobject, and complexobject did anyways. */
1591
1592#ifndef Py_LIMITED_API
1593PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1594 PyObject *unicode /* Unicode object */
1595 );
1596#endif
1597
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001598/* --- Locale encoding --------------------------------------------------- */
1599
1600/* Decode a string from the current locale encoding. The decoder is strict if
1601 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
1602 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
1603 be decoded as a surrogate character and *surrogateescape* is not equal to
1604 zero, the byte sequence is escaped using the 'surrogateescape' error handler
1605 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001606 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001607
1608PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1609 const char *str,
1610 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01001611 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001612
1613/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1614 length using strlen(). */
1615
1616PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1617 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001618 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001619
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001620/* Encode a Unicode object to the current locale encoding. The encoder is
1621 strict if *surrogateescape* is equal to zero, otherwise the
1622 "surrogateescape" error handler is used. Return a bytes object. The string
1623 cannot contain embedded null characters. */
1624
1625PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1626 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +01001627 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001628 );
1629
Martin v. Löwis011e8422009-05-05 04:43:17 +00001630/* --- File system encoding ---------------------------------------------- */
1631
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001632/* ParseTuple converter: encode str objects to bytes using
1633 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001634
1635PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1636
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001637/* ParseTuple converter: decode bytes objects to unicode using
1638 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1639
1640PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1641
Victor Stinner77c38622010-05-14 15:58:55 +00001642/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1643 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001644
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001645 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1646 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001647
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001648 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001649*/
1650
1651PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1652 const char *s /* encoded string */
1653 );
1654
Victor Stinner77c38622010-05-14 15:58:55 +00001655/* Decode a string using Py_FileSystemDefaultEncoding
1656 and the "surrogateescape" error handler.
1657
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001658 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1659 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001660*/
1661
Martin v. Löwis011e8422009-05-05 04:43:17 +00001662PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1663 const char *s, /* encoded string */
1664 Py_ssize_t size /* size */
1665 );
1666
Victor Stinnerae6265f2010-05-15 16:27:27 +00001667/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001668 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001669
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001670 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1671 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001672*/
1673
1674PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1675 PyObject *unicode
1676 );
1677
Guido van Rossumd8225182000-03-10 22:33:05 +00001678/* --- Methods & Slots ----------------------------------------------------
1679
1680 These are capable of handling Unicode objects and strings on input
1681 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001682 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001683
1684/* Concat two strings giving a new Unicode string. */
1685
Mark Hammond91a681d2002-08-12 07:21:58 +00001686PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001687 PyObject *left, /* Left string */
1688 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001689 );
1690
Walter Dörwald1ab83302007-05-18 17:15:44 +00001691/* Concat two strings and put the result in *pleft
1692 (sets *pleft to NULL on error) */
1693
1694PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 PyObject **pleft, /* Pointer to left string */
1696 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001697 );
1698
1699/* Concat two strings, put the result in *pleft and drop the right object
1700 (sets *pleft to NULL on error) */
1701
1702PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001703 PyObject **pleft, /* Pointer to left string */
1704 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001705 );
1706
Guido van Rossumd8225182000-03-10 22:33:05 +00001707/* Split a string giving a list of Unicode strings.
1708
1709 If sep is NULL, splitting will be done at all whitespace
1710 substrings. Otherwise, splits occur at the given separator.
1711
1712 At most maxsplit splits will be done. If negative, no limit is set.
1713
1714 Separators are not included in the resulting list.
1715
1716*/
1717
Mark Hammond91a681d2002-08-12 07:21:58 +00001718PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 PyObject *s, /* String to split */
1720 PyObject *sep, /* String separator */
1721 Py_ssize_t maxsplit /* Maxsplit count */
1722 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001723
1724/* Ditto, but split at line breaks.
1725
1726 CRLF is considered to be one line break. Line breaks are not
1727 included in the resulting list unless keepends is nonzero. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728
Mark Hammond91a681d2002-08-12 07:21:58 +00001729PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001730 PyObject *s, /* String to split */
1731 int keepends /* If true, line end markers are included */
1732 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001733
Thomas Wouters477c8d52006-05-27 19:21:47 +00001734/* Partition a string using a given separator. */
1735
1736PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 PyObject *s, /* String to partition */
1738 PyObject *sep /* String separator */
1739 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001740
1741/* Partition a string using a given separator, searching from the end of the
1742 string. */
1743
1744PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 PyObject *s, /* String to partition */
1746 PyObject *sep /* String separator */
1747 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001748
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001749/* Split a string giving a list of Unicode strings.
1750
1751 If sep is NULL, splitting will be done at all whitespace
1752 substrings. Otherwise, splits occur at the given separator.
1753
1754 At most maxsplit splits will be done; if negative, no limit is
1755 set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
1756 of the string.
1757
1758 Separators are not included in the resulting list.
1759
1760*/
1761
1762PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001763 PyObject *s, /* String to split */
1764 PyObject *sep, /* String separator */
1765 Py_ssize_t maxsplit /* Maxsplit count */
1766 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001767
Guido van Rossumd8225182000-03-10 22:33:05 +00001768/* Translate a string by applying a character mapping table to it and
1769 return the resulting Unicode object.
1770
1771 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001773
1774 Mapping tables may be dictionaries or sequences. Unmapped character
1775 ordinals (ones which cause a LookupError) are left untouched and
1776 are copied as-is.
1777
1778*/
1779
Mark Hammond91a681d2002-08-12 07:21:58 +00001780PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001781 PyObject *str, /* String */
1782 PyObject *table, /* Translate table */
1783 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001784 );
1785
1786/* Join a sequence of strings using the given separator and return
1787 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001788
Mark Hammond91a681d2002-08-12 07:21:58 +00001789PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001790 PyObject *separator, /* Separator string */
1791 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001792 );
1793
1794/* Return 1 if substr matches str[start:end] at the given tail end, 0
1795 otherwise. */
1796
Martin v. Löwis18e16552006-02-15 17:27:45 +00001797PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001798 PyObject *str, /* String */
1799 PyObject *substr, /* Prefix or Suffix string */
1800 Py_ssize_t start, /* Start index */
1801 Py_ssize_t end, /* Stop index */
1802 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001803 );
1804
1805/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001806 given search direction or -1 if not found. -2 is returned in case
1807 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001808
Martin v. Löwis18e16552006-02-15 17:27:45 +00001809PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001810 PyObject *str, /* String */
1811 PyObject *substr, /* Substring to find */
1812 Py_ssize_t start, /* Start index */
1813 Py_ssize_t end, /* Stop index */
1814 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001815 );
1816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817/* Like PyUnicode_Find, but search for single character only. */
1818PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1819 PyObject *str,
1820 Py_UCS4 ch,
1821 Py_ssize_t start,
1822 Py_ssize_t end,
1823 int direction
1824 );
1825
Barry Warsaw51ac5802000-03-20 16:36:48 +00001826/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001827
Martin v. Löwis18e16552006-02-15 17:27:45 +00001828PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001829 PyObject *str, /* String */
1830 PyObject *substr, /* Substring to count */
1831 Py_ssize_t start, /* Start index */
1832 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001833 );
1834
Barry Warsaw51ac5802000-03-20 16:36:48 +00001835/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001836 and return the resulting Unicode object. */
1837
Mark Hammond91a681d2002-08-12 07:21:58 +00001838PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 PyObject *str, /* String */
1840 PyObject *substr, /* Substring to find */
1841 PyObject *replstr, /* Substring to replace */
1842 Py_ssize_t maxcount /* Max. number of replacements to apply;
1843 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001844 );
1845
1846/* Compare two strings and return -1, 0, 1 for less than, equal,
1847 greater than resp. */
1848
Mark Hammond91a681d2002-08-12 07:21:58 +00001849PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 PyObject *left, /* Left string */
1851 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001852 );
1853
Martin v. Löwis5b222132007-06-10 09:51:05 +00001854PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1855 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001856 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001857 );
1858
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001859/* Rich compare two strings and return one of the following:
1860
1861 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001862 - Py_True or Py_False for successfully comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001863 - Py_NotImplemented in case the type combination is unknown
1864
1865 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1866 case the conversion of the arguments to Unicode fails with a
1867 UnicodeDecodeError.
1868
1869 Possible values for op:
1870
1871 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1872
1873*/
1874
1875PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001876 PyObject *left, /* Left string */
1877 PyObject *right, /* Right string */
1878 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001879 );
1880
Thomas Wouters7e474022000-07-16 12:04:32 +00001881/* Apply a argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001882 the resulting Unicode string. */
1883
Mark Hammond91a681d2002-08-12 07:21:58 +00001884PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001885 PyObject *format, /* Format string */
1886 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001887 );
1888
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001889/* Checks whether element is contained in container and return 1/0
1890 accordingly.
1891
1892 element has to coerce to an one element Unicode string. -1 is
1893 returned in case of an error. */
1894
Mark Hammond91a681d2002-08-12 07:21:58 +00001895PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001896 PyObject *container, /* Container string */
1897 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001898 );
1899
Martin v. Löwis47383402007-08-15 07:32:56 +00001900/* Checks whether argument is a valid identifier. */
1901
1902PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1903
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001904#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001905/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001906PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001907 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001908 int striptype,
1909 PyObject *sepobj
1910 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001911#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001912
Eric Smith5807c412008-05-11 21:00:57 +00001913/* Using the current locale, insert the thousands grouping
1914 into the string pointed to by buffer. For the argument descriptions,
1915 see Objects/stringlib/localeutil.h */
1916
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001917#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001918PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1919 Py_ssize_t n_buffer,
1920 Py_UNICODE *digits,
1921 Py_ssize_t n_digits,
1922 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001923#endif
Eric Smith5807c412008-05-11 21:00:57 +00001924
Eric Smitha3b1ac82009-04-03 14:45:06 +00001925/* Using explicit passed-in values, insert the thousands grouping
1926 into the string pointed to by buffer. For the argument descriptions,
1927 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001928#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001930 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 int kind,
1932 void *buffer,
1933 Py_ssize_t n_buffer,
1934 void *digits,
1935 Py_ssize_t n_digits,
1936 Py_ssize_t min_width,
1937 const char *grouping,
1938 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001939#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001940/* === Characters Type APIs =============================================== */
1941
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001942/* Helper array used by Py_UNICODE_ISSPACE(). */
1943
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001944#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001945PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1946
Guido van Rossumd8225182000-03-10 22:33:05 +00001947/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001948 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001949
1950 These APIs are implemented in Objects/unicodectype.c.
1951
1952*/
1953
Mark Hammond91a681d2002-08-12 07:21:58 +00001954PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001955 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001956 );
1957
Mark Hammond91a681d2002-08-12 07:21:58 +00001958PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001959 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001960 );
1961
Mark Hammond91a681d2002-08-12 07:21:58 +00001962PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001963 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001964 );
1965
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001966PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001967 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001968 );
1969
1970PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001971 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001972 );
1973
Mark Hammond91a681d2002-08-12 07:21:58 +00001974PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001975 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001976 );
1977
Mark Hammond91a681d2002-08-12 07:21:58 +00001978PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001979 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001980 );
1981
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001982PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1983 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001984 );
1985
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001986PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1987 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001988 );
1989
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001990PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1991 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001992 );
1993
Mark Hammond91a681d2002-08-12 07:21:58 +00001994PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001995 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001996 );
1997
Mark Hammond91a681d2002-08-12 07:21:58 +00001998PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001999 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002000 );
2001
Mark Hammond91a681d2002-08-12 07:21:58 +00002002PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002003 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002004 );
2005
Mark Hammond91a681d2002-08-12 07:21:58 +00002006PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002007 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002008 );
2009
Mark Hammond91a681d2002-08-12 07:21:58 +00002010PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002011 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002012 );
2013
Mark Hammond91a681d2002-08-12 07:21:58 +00002014PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002015 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002016 );
2017
Georg Brandl559e5d72008-06-11 18:37:52 +00002018PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002019 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002020 );
2021
Mark Hammond91a681d2002-08-12 07:21:58 +00002022PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002023 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002024 );
2025
Victor Stinneref8d95c2010-08-16 22:03:11 +00002026PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2027 const Py_UNICODE *u
2028 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002029
2030PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002031 Py_UNICODE *s1,
2032 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002033
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002034PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2035 Py_UNICODE *s1, const Py_UNICODE *s2);
2036
Martin v. Löwis5b222132007-06-10 09:51:05 +00002037PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002038 Py_UNICODE *s1,
2039 const Py_UNICODE *s2,
2040 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002041
2042PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002043 const Py_UNICODE *s1,
2044 const Py_UNICODE *s2
2045 );
2046
2047PyAPI_FUNC(int) Py_UNICODE_strncmp(
2048 const Py_UNICODE *s1,
2049 const Py_UNICODE *s2,
2050 size_t n
2051 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002052
2053PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002054 const Py_UNICODE *s,
2055 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002056 );
2057
Victor Stinner331ea922010-08-10 16:37:20 +00002058PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002059 const Py_UNICODE *s,
2060 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002061 );
2062
Victor Stinner71133ff2010-09-01 23:43:53 +00002063/* Create a copy of a unicode string ending with a nul character. Return NULL
2064 and raise a MemoryError exception on memory allocation failure, otherwise
2065 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2066
Victor Stinner46408602010-09-03 16:18:00 +00002067PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002068 PyObject *unicode
2069 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002070#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002071
#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not.

   NOTE(review): presumably verifies the internal consistency of op, with
   check_content selecting whether the character data is checked too --
   confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(PyObject *op, int check_content);
#endif
2077
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002078/********************* String Literals ****************************************/
2079/* This structure helps managing static strings. The basic usage goes like this:
2080 Instead of doing
2081
2082 r = PyObject_CallMethod(o, "foo", "args", ...);
2083
2084 do
2085
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002086 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002087 ...
2088 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2089
2090 PyId_foo is a static variable, either on block level or file level. On first
2091 usage, the string "foo" is interned, and the structures are linked. On interpreter
2092 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2093
2094 Alternatively, _Py_static_string allows to choose the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002095 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002096 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2097*/
2098typedef struct _Py_Identifier {
2099 struct _Py_Identifier *next;
2100 const char* string;
2101 PyObject *object;
2102} _Py_Identifier;
2103
/* _Py_static_string(varname, value) defines a static _Py_Identifier called
   varname whose string member is value; its next and object members start
   out as 0.  _Py_IDENTIFIER(varname) does the same for a variable called
   PyId_<varname> holding the string "<varname>". */
#define _Py_static_string(varname, value) \
    static _Py_Identifier varname = { 0, value, 0 }
#define _Py_IDENTIFIER(varname) \
    _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002106
2107/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2108PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2109/* Clear all static strings. */
2110PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2111
Guido van Rossumd8225182000-03-10 22:33:05 +00002112#ifdef __cplusplus
2113}
2114#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002115#endif /* !Py_UNICODEOBJECT_H */