blob: 836bafb527c7d7d2ab70a81540d5482a34a1518a [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000145#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
152
153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
156
157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000161
162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
165
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000167
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168#define Py_UNICODE_ISALNUM(ch) \
169 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_UNICODE_ISDECIMAL(ch) || \
171 Py_UNICODE_ISDIGIT(ch) || \
172 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200174#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store `value` into each of the first `length` Py_UNICODE slots of `target`.

   Each argument is evaluated exactly once, in the order target, value,
   length; the length is captured before the loop so an argument with side
   effects is not re-run on every iteration.  A length <= 0 writes nothing.
   The do { } while (0) wrapper lets the macro be used as a single
   statement. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value); \
        Py_ssize_t n_ = (length); Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_; \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* Macros to work with surrogates.

   Each predicate is true when ch lies in the given UTF-16 surrogate range:
   any surrogate (0xD800-0xDFFF), high surrogate (0xD800-0xDBFF) or low
   surrogate (0xDC00-0xDFFF).  The argument is parenthesized so that a
   compound expression such as `c & 0xFFFF` is compared as a whole.  Note
   that ch is evaluated twice, so it must not have side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
186/* Join two surrogate characters and return a single Py_UCS4 value. */
187#define Py_UNICODE_JOIN_SURROGATES(high, low) \
188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \
189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
Victor Stinner551ac952011-11-29 22:58:13 +0100190/* high surrogate = top 10 bits added to D800 */
191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10))
192/* low surrogate = bottom 10 bits added to DC00 */
193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300194
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000195/* Check if substring matches at given offset. The offset must be
196 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200199 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
200 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
201 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000203#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000211#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214 structure. state.ascii and state.compact are set, and the data
215 immediately follow the structure. utf8_length and wstr_length can be found
216 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000217typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200218 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200219
220 - compact ascii:
221
222 * structure = PyASCIIObject
223 * kind = PyUnicode_1BYTE_KIND
224 * compact = 1
225 * ascii = 1
226 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200227 * (length is the length of the utf8 and wstr strings)
228 * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200230
231 - compact:
232
233 * structure = PyCompactUnicodeObject
234 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
235 PyUnicode_4BYTE_KIND
236 * compact = 1
237 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200238 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200239 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200240 * utf8_length = 0 if utf8 is NULL
241 * wstr is shared with data and wstr_length=length
242 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100243 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200244 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200245 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200246
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200247 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200248
249 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100250 * length = 0 (use wstr_length)
251 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200252 * kind = PyUnicode_WCHAR_KIND
253 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200254 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200255 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100256 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200257 * wstr is not NULL
258 * data.any is NULL
259 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200260 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200261
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200262 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200263
264 * structure = PyUnicodeObject structure
265 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
266 PyUnicode_4BYTE_KIND
267 * compact = 0
268 * ready = 1
269 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200270 * utf8 is shared and utf8_length = length with data.any if ascii = 1
271 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100272 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200273 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
275 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200276
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200277 Compact strings use only one memory block (structure + characters),
278 whereas legacy strings use one block for the structure and one block
279 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200280
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200281 Legacy strings are created by PyUnicode_FromUnicode() and
282 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
283 when PyUnicode_READY() is called.
284
285 See also _PyUnicode_CheckConsistency().
286 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000287 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200288 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000289 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200290 struct {
291 /*
292 SSTATE_NOT_INTERNED (0)
293 SSTATE_INTERNED_MORTAL (1)
294 SSTATE_INTERNED_IMMORTAL (2)
295
296 If interned != SSTATE_NOT_INTERNED, the two references from the
297 dictionary to this object are *not* counted in ob_refcnt.
298 */
299 unsigned int interned:2;
300 /* Character size:
301
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302 - PyUnicode_WCHAR_KIND (0):
303
304 * character type = wchar_t (16 or 32 bits, depending on the
305 platform)
306
307 - PyUnicode_1BYTE_KIND (1):
308
309 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100310 * all characters are in the range U+0000-U+00FF (latin1)
311 * if ascii is set, all characters are in the range U+0000-U+007F
312 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200313 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200314
315 - PyUnicode_2BYTE_KIND (2):
316
317 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100318 * all characters are in the range U+0000-U+FFFF (BMP)
319 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200320
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200321 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200322
323 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100324 * all characters are in the range U+0000-U+10FFFF
325 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200326 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200327 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200328 /* Compact is with respect to the allocation scheme. Compact unicode
329 objects only require one memory block while non-compact objects use
330 one block for the PyUnicodeObject struct and another for its data
331 buffer. */
332 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100333 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200334 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
335 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200336 unsigned int ascii:1;
337 /* The ready flag indicates whether the object layout is initialized
338 completely. This means that this is either a compact object, or
339 the data pointer is filled out. The bit is redundant, and helps
340 to minimize the test in PyUnicode_IS_READY(). */
341 unsigned int ready:1;
342 } state;
343 wchar_t *wstr; /* wchar_t representation (null-terminated) */
344} PyASCIIObject;
345
346/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200347 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348 immediately follow the structure. */
349typedef struct {
350 PyASCIIObject _base;
351 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
352 * terminating \0. */
353 char *utf8; /* UTF-8 representation (null-terminated) */
354 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
355 * surrogates count as two code points. */
356} PyCompactUnicodeObject;
357
358/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
359 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200360 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200361typedef struct {
362 PyCompactUnicodeObject _base;
363 union {
364 void *any;
365 Py_UCS1 *latin1;
366 Py_UCS2 *ucs2;
367 Py_UCS4 *ucs4;
368 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000369} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000370#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000371
Mark Hammond91a681d2002-08-12 07:21:58 +0000372PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000373PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000374
Thomas Wouters27d517b2007-02-25 20:39:11 +0000375#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000376 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
377#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000378
379/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000380#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381
/* Length of the wstr (wchar_t) representation of op.

   Compact ASCII objects use the PyASCIIObject structure, which has no
   wstr_length field, so their `length` field is returned instead; all other
   objects are read through PyCompactUnicodeObject.wstr_length.  No type
   checks are performed, and op is evaluated more than once.  The argument
   is parenthesized so the pointer casts apply to the whole expression. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
386
387/* Returns the deprecated Py_UNICODE representation's size in code units
388 (this includes surrogate pairs as 2 units).
389 If the Py_UNICODE representation is not available, it will be computed
390 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
391
Victor Stinnerf3ae6202011-11-21 02:24:49 +0100392#define PyUnicode_GET_SIZE(op) \
393 (assert(PyUnicode_Check(op)), \
394 (((PyASCIIObject *)(op))->wstr) ? \
395 PyUnicode_WSTR_LENGTH(op) : \
396 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
397 assert(((PyASCIIObject *)(op))->wstr), \
398 PyUnicode_WSTR_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200399
Guido van Rossumd8225182000-03-10 22:33:05 +0000400#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
402
403/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
404 representation on demand. Using this macro is very inefficient now,
405 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
406 use PyUnicode_WRITE() and PyUnicode_READ(). */
407
Guido van Rossumd8225182000-03-10 22:33:05 +0000408#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200409 (assert(PyUnicode_Check(op)), \
410 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
411 PyUnicode_AsUnicode((PyObject *)(op)))
412
Guido van Rossumd8225182000-03-10 22:33:05 +0000413#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414 ((const char *)(PyUnicode_AS_UNICODE(op)))
415
416
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200417/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418
Victor Stinner6f9568b2011-11-17 00:12:44 +0100419/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200420
421/* Interning state. */
422#define SSTATE_NOT_INTERNED 0
423#define SSTATE_INTERNED_MORTAL 1
424#define SSTATE_INTERNED_IMMORTAL 2
425
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed. The argument is parenthesized so that the
   cast applies to the whole expression passed as op. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)
431
432/* Return true if the string is compact or 0 if not.
433 No type checks or Ready calls are performed. */
434#define PyUnicode_IS_COMPACT(op) \
435 (((PyASCIIObject*)(op))->state.compact)
436
437/* Return true if the string is a compact ASCII string (use PyASCIIObject
438 structure), or 0 if not. No type checks or Ready calls are performed. */
439#define PyUnicode_IS_COMPACT_ASCII(op) \
440 (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200442enum PyUnicode_Kind {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200444 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445 has not been called yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200446 PyUnicode_WCHAR_KIND = 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447/* Return values of the PyUnicode_KIND() macro: */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200448 PyUnicode_1BYTE_KIND = 1,
449 PyUnicode_2BYTE_KIND = 2,
450 PyUnicode_4BYTE_KIND = 4
451};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200452
Georg Brandl4975a9b2011-10-05 16:12:21 +0200453/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200455 No checks are performed, use PyUnicode_KIND() before to ensure
456 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200457
458#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
459#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
460#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
461
Victor Stinner157f83f2011-09-28 21:41:31 +0200462/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463#define PyUnicode_KIND(op) \
464 (assert(PyUnicode_Check(op)), \
465 assert(PyUnicode_IS_READY(op)), \
466 ((PyASCIIObject *)(op))->state.kind)
467
Victor Stinner157f83f2011-09-28 21:41:31 +0200468/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200469#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200470 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 ((void*)((PyASCIIObject*)(op) + 1)) : \
472 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
473
474#define _PyUnicode_NONCOMPACT_DATA(op) \
475 (assert(((PyUnicodeObject*)(op))->data.any), \
476 ((((PyUnicodeObject *)(op))->data.any)))
477
478#define PyUnicode_DATA(op) \
479 (assert(PyUnicode_Check(op)), \
480 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
481 _PyUnicode_NONCOMPACT_DATA(op))
482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200483/* In the access macros below, "kind" may be evaluated more than once.
484 All other macro parameters are evaluated exactly once, so it is safe
485 to put side effects into them (such as increasing the index). */
486
487/* Write into the canonical representation, this macro does not do any sanity
488 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200489 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200491 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200492#define PyUnicode_WRITE(kind, data, index, value) \
493 do { \
494 switch ((kind)) { \
495 case PyUnicode_1BYTE_KIND: { \
496 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
497 break; \
498 } \
499 case PyUnicode_2BYTE_KIND: { \
500 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
501 break; \
502 } \
503 default: { \
504 assert((kind) == PyUnicode_4BYTE_KIND); \
505 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
506 } \
507 } \
508 } while (0)
509
Georg Brandl07de3252011-10-05 16:47:38 +0200510/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511 or ready calls are performed. */
512#define PyUnicode_READ(kind, data, index) \
513 ((Py_UCS4) \
514 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200515 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200516 ((kind) == PyUnicode_2BYTE_KIND ? \
517 ((const Py_UCS2 *)(data))[(index)] : \
518 ((const Py_UCS4 *)(data))[(index)] \
519 ) \
520 ))
521
522/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
523 calls PyUnicode_KIND() and might call it twice. For single reads, use
524 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
525 cache kind and use PyUnicode_READ instead. */
526#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200527 (assert(PyUnicode_Check(unicode)), \
528 assert(PyUnicode_IS_READY(unicode)), \
529 (Py_UCS4) \
530 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
531 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
532 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
533 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
534 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
535 ) \
536 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro. Call PyUnicode_READY() to ensure that. */
541#define PyUnicode_GET_LENGTH(op) \
542 (assert(PyUnicode_Check(op)), \
543 assert(PyUnicode_IS_READY(op)), \
544 ((PyASCIIObject *)(op))->length)
545
546
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   No type checks are performed. The argument is parenthesized so that the
   cast applies to the whole expression passed as op. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
551
Victor Stinnera3b334d2011-10-03 13:53:37 +0200552/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200554 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 Returns 0 on success and -1 on errors. */
556#define PyUnicode_READY(op) \
557 (assert(PyUnicode_Check(op)), \
558 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200559 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561/* Return a maximum character value which is suitable for creating another
562 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200563 than iterating over the string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564#define PyUnicode_MAX_CHAR_VALUE(op) \
565 (assert(PyUnicode_IS_READY(op)), \
Victor Stinner88131042011-10-13 01:12:01 +0200566 (PyUnicode_IS_ASCII(op) ? \
567 (0x7f) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200568 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200569 (0xffU) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200570 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200571 (0xffffU) : \
572 (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200573
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000574#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000575
576/* --- Constants ---------------------------------------------------------- */
577
/* Character substituted during decoding when the errors argument is set
   to "replace".  U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4)0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000584
585/* === Public API ========================================================= */
586
587/* --- Plain Py_UNICODE --------------------------------------------------- */
588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200589/* With PEP 393, this is the recommended way to allocate a new unicode object.
590 This function will allocate the object and its buffer in a single memory
591 block. Objects created using this function are not resizable. */
592#ifndef Py_LIMITED_API
593PyAPI_FUNC(PyObject*) PyUnicode_New(
594 Py_ssize_t size, /* Number of code points in the new string */
595 Py_UCS4 maxchar /* maximum code point value in the string */
596 );
597#endif
598
Victor Stinnerd8f65102011-09-29 19:43:17 +0200599/* Initializes the canonical string representation from a the deprecated
600 wstr/Py_UNICODE representation. This function is used to convert Unicode
601 objects which were created using the old API to the new flexible format
602 introduced with PEP 393.
603
604 Don't call this function directly, use the public PyUnicode_READY() macro
605 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606#ifndef Py_LIMITED_API
607PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200608 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609 );
610#endif
611
Victor Stinner034f6cf2011-09-30 02:26:44 +0200612/* Get a copy of a Unicode string. */
613PyAPI_FUNC(PyObject*) PyUnicode_Copy(
614 PyObject *unicode
615 );
616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618 character conversion when necessary and falls back to memcpy if possible.
619
Victor Stinnera0702ab2011-09-29 14:14:38 +0200620 Fail if to is too small (smaller than how_many or smaller than
621 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
622 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200623
624 Return the number of written character, or return -1 and raise an exception
625 on error.
626
627 Pseudo-code:
628
629 how_many = min(how_many, len(from) - from_start)
630 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
631 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200632
633 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200634 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200636PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637 PyObject *to,
638 Py_ssize_t to_start,
639 PyObject *from,
640 Py_ssize_t from_start,
641 Py_ssize_t how_many
642 );
643#endif
644
Guido van Rossumd8225182000-03-10 22:33:05 +0000645/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000647
648 u may be NULL which causes the contents to be undefined. It is the
649 user's responsibility to fill in the needed data afterwards. Note
650 that modifying the Unicode object contents after construction is
651 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000652
653 The buffer is copied into the new object. */
654
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000655#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000656PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000657 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000658 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000659 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000660#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000661
Georg Brandl952867a2010-06-27 10:17:12 +0000662/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000663PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000664 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000665 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000666 );
667
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000668/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000670PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000671 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000672 );
673
Victor Stinnerb9275c12011-10-05 14:01:42 +0200674/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
675 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676#ifndef Py_LIMITED_API
677PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
678 int kind,
679 const void *buffer,
680 Py_ssize_t size);
681#endif
682
683PyAPI_FUNC(PyObject*) PyUnicode_Substring(
684 PyObject *str,
685 Py_ssize_t start,
686 Py_ssize_t end);
687
Georg Brandldb6c7f52011-10-07 11:19:11 +0200688/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200689 is set. Return NULL and raise an exception on error. Raise a ValueError if
690 the buffer is smaller than the string. Return buffer on success.
691
692 buflen is the length of the buffer in (Py_UCS4) characters. */
693PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
694 PyObject *unicode,
695 Py_UCS4* buffer,
696 Py_ssize_t buflen,
697 int copy_null);
698
699/* Copy the string into a UCS4 buffer. A new buffer is allocated using
700 * PyMem_Malloc; if this fails, NULL is returned with a memory error
701 exception set. */
702PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
703
Guido van Rossumd8225182000-03-10 22:33:05 +0000704/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705 Py_UNICODE buffer.
706 If the wchar_t/Py_UNICODE representation is not yet available, this
707 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000708
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000709#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000710PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000712 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000713#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715/* Return a read-only pointer to the Unicode object's internal
716 Py_UNICODE buffer and save the length at size.
717 If the wchar_t/Py_UNICODE representation is not yet available, this
718 function will calculate it. */
719
720#ifndef Py_LIMITED_API
721PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
722 PyObject *unicode, /* Unicode object */
723 Py_ssize_t *size /* location where to save the length */
724 );
725#endif
726
Guido van Rossumd8225182000-03-10 22:33:05 +0000727/* Get the length of the Unicode object. */
728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
730 PyObject *unicode
731);
732
Victor Stinner157f83f2011-09-28 21:41:31 +0200733/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734 string representation. */
735
Martin v. Löwis18e16552006-02-15 17:27:45 +0000736PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000738 );
739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740/* Read a character from the string. */
741
742PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
743 PyObject *unicode,
744 Py_ssize_t index
745 );
746
747/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200748 PyUnicode_New, must not be shared, and must not have been hashed yet.
749
750 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751
752PyAPI_FUNC(int) PyUnicode_WriteChar(
753 PyObject *unicode,
754 Py_ssize_t index,
755 Py_UCS4 character
756 );
757
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000758#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000759/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000760PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000761#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000762
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200763/* Resize an Unicode object allocated by the legacy API (e.g.
764 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
765 PyUnicode_New) cannot be resized by this function.
766
Victor Stinner93439992011-11-20 18:29:14 +0100767 The length is a number of characters (and not the number of Py_UNICODE characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000768
769 *unicode is modified to point to the new (resized) object and 0
770 returned on success.
771
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200772 If the refcount on the object is 1, the function resizes the string in
773 place, which is usually faster than allocating a new string (and copy
774 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000775
776 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000778
Mark Hammond91a681d2002-08-12 07:21:58 +0000779PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 PyObject **unicode, /* Pointer to the Unicode object */
781 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000782 );
783
Guido van Rossumd8225182000-03-10 22:33:05 +0000784/* Coerce obj to an Unicode object and return a reference with
785 *incremented* refcount.
786
787 Coercion is done in the following way:
788
Georg Brandl952867a2010-06-27 10:17:12 +0000789 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000790 under the assumptions that they contain data using the UTF-8
791 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000792
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000793 2. All other objects (including Unicode objects) raise an
794 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000795
796 The API returns NULL in case of an error. The caller is responsible
797 for decref'ing the returned objects.
798
799*/
800
Mark Hammond91a681d2002-08-12 07:21:58 +0000801PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000803 const char *encoding, /* encoding */
804 const char *errors /* error handling */
805 );
806
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000807/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000808 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000810 Unicode objects are passed back as-is (subclasses are converted to
811 true Unicode objects), all other objects are delegated to
812 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000813 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000814
815 The API returns NULL in case of an error. The caller is responsible
816 for decref'ing the returned objects.
817
818*/
819
Mark Hammond91a681d2002-08-12 07:21:58 +0000820PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000822 );
823
Victor Stinner1205f272010-09-11 00:54:47 +0000824PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
825 const char *format, /* ASCII-encoded string */
826 va_list vargs
827 );
828PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
829 const char *format, /* ASCII-encoded string */
830 ...
831 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000832
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000833#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000834/* Format the object based on the format_spec, as defined in PEP 3101
835 (Advanced String Formatting). */
836PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200837 PyObject *format_spec,
838 Py_ssize_t start,
839 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000840#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000841
Walter Dörwald16807132007-05-25 13:52:07 +0000842PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
843PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000844PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
845 const char *u /* UTF-8 encoded string */
846 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000847#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000848PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000849#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000850
/* Read the `interned` state field of op.  No type check is made: use this
   only if you know op is a string. */
#define PyUnicode_CHECK_INTERNED(op) (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000854
Guido van Rossumd8225182000-03-10 22:33:05 +0000855/* --- wchar_t support for platforms which support it --------------------- */
856
857#ifdef HAVE_WCHAR_H
858
Georg Brandl952867a2010-06-27 10:17:12 +0000859/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000860 size.
861
862 The buffer is copied into the new object. */
863
Mark Hammond91a681d2002-08-12 07:21:58 +0000864PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000865 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 );
868
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000869/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000870 most size wchar_t characters are copied.
871
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000872 Note that the resulting wchar_t string may or may not be
873 0-terminated. It is the responsibility of the caller to make sure
874 that the wchar_t string is 0-terminated in case this is required by
875 the application.
876
877 Returns the number of wchar_t characters copied (excluding a
878 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000879 error. */
880
Martin v. Löwis18e16552006-02-15 17:27:45 +0000881PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000882 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000883 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000884 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000885 );
886
Victor Stinner137c34c2010-09-29 10:25:54 +0000887/* Convert the Unicode object to a wide character string. The output string
888 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200889 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000890
891 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it)
892 on success. On error, returns NULL, *size is undefined and raises a
893 MemoryError. */
894
895PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000896 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000897 Py_ssize_t *size /* number of characters of the result */
898 );
899
Victor Stinner9f789e72011-10-01 03:57:28 +0200900#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200902#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903
Guido van Rossumd8225182000-03-10 22:33:05 +0000904#endif
905
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000906/* --- Unicode ordinals --------------------------------------------------- */
907
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908/* Create a Unicode Object from the given Unicode code point ordinal.
909
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000910 The ordinal must be in range(0x10000) on narrow Python builds
911 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
912 raised in case it is not.
913
914*/
915
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000916PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000918/* --- Free-list management ----------------------------------------------- */
919
920/* Clear the free list used by the Unicode implementation.
921
922 This can be used to release memory used for objects on the free
923 list back to the Python memory allocator.
924
925*/
926
927PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
928
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000929/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000930
931 Many of these APIs take two arguments encoding and errors. These
932 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000933 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000934
Georg Brandl952867a2010-06-27 10:17:12 +0000935 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000936
937 Error handling is set by errors which may also be set to NULL
938 meaning to use the default handling defined for the codec. Default
939 error handling for all builtin codecs is "strict" (ValueErrors are
940 raised).
941
   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.
944
945*/
946
Fred Drakecb093fe2000-05-09 19:51:53 +0000947/* --- Manage the default encoding ---------------------------------------- */
948
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000949/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000950 Unicode object unicode and the size of the encoded representation
951 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000952
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000953 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000954
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200955 This function caches the UTF-8 encoded string in the unicodeobject
956 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 when the unicodeobject is deallocated.
958
959 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
960 support the previous internal function with the same behaviour.
961
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000962 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000963 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000964
965 *** If you need to access the Unicode object as UTF-8 bytes string,
966 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000967*/
968
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000969#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000971 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000972 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000974#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000975
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000976/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000977 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
980 in the unicodeobject.
981
982 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
983 support the previous internal function with the same behaviour.
984
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000985 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000986 extracted from the returned data.
987
988 *** This API is for interpreter INTERNAL USE ONLY and will likely
989 *** be removed or changed for Python 3.1.
990
991 *** If you need to access the Unicode object as UTF-8 bytes string,
992 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000993
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000994*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000995
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000996#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
998#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000999#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001000
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001001/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001002
Mark Hammond91a681d2002-08-12 07:21:58 +00001003PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001004
Guido van Rossumd8225182000-03-10 22:33:05 +00001005/* --- Generic Codecs ----------------------------------------------------- */
1006
1007/* Create a Unicode object by decoding the encoded string s of the
1008 given size. */
1009
Mark Hammond91a681d2002-08-12 07:21:58 +00001010PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001011 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001012 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001013 const char *encoding, /* encoding */
1014 const char *errors /* error handling */
1015 );
1016
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001017/* Decode a Unicode object unicode and return the result as Python
1018 object. */
1019
1020PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 PyObject *unicode, /* Unicode object */
1022 const char *encoding, /* encoding */
1023 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001024 );
1025
1026/* Decode a Unicode object unicode and return the result as Unicode
1027 object. */
1028
1029PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 PyObject *unicode, /* Unicode object */
1031 const char *encoding, /* encoding */
1032 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001033 );
1034
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001035/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001036 Python string object. */
1037
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001038#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001039PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001040 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001042 const char *encoding, /* encoding */
1043 const char *errors /* error handling */
1044 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001045#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001046
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001047/* Encodes a Unicode object and returns the result as Python
1048 object. */
1049
1050PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 PyObject *unicode, /* Unicode object */
1052 const char *encoding, /* encoding */
1053 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001054 );
1055
Guido van Rossumd8225182000-03-10 22:33:05 +00001056/* Encodes a Unicode object and returns the result as Python string
1057 object. */
1058
Mark Hammond91a681d2002-08-12 07:21:58 +00001059PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001060 PyObject *unicode, /* Unicode object */
1061 const char *encoding, /* encoding */
1062 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001063 );
1064
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001065/* Encodes a Unicode object and returns the result as Unicode
1066 object. */
1067
1068PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 PyObject *unicode, /* Unicode object */
1070 const char *encoding, /* encoding */
1071 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001072 );
1073
1074/* Build an encoding map. */
1075
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001076PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1077 PyObject* string /* 256 character map */
1078 );
1079
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080/* --- UTF-7 Codecs ------------------------------------------------------- */
1081
Mark Hammond91a681d2002-08-12 07:21:58 +00001082PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001083 const char *string, /* UTF-7 encoded string */
1084 Py_ssize_t length, /* size of string */
1085 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 );
1087
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001088PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 const char *string, /* UTF-7 encoded string */
1090 Py_ssize_t length, /* size of string */
1091 const char *errors, /* error handling */
1092 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001093 );
1094
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001095#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001096PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 const Py_UNICODE *data, /* Unicode char buffer */
1098 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1099 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1100 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1101 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001102 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001103PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1104 PyObject *unicode, /* Unicode object */
1105 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1106 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1107 const char *errors /* error handling */
1108 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001109#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110
Guido van Rossumd8225182000-03-10 22:33:05 +00001111/* --- UTF-8 Codecs ------------------------------------------------------- */
1112
Mark Hammond91a681d2002-08-12 07:21:58 +00001113PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001114 const char *string, /* UTF-8 encoded string */
1115 Py_ssize_t length, /* size of string */
1116 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001117 );
1118
Walter Dörwald69652032004-09-07 20:24:22 +00001119PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001120 const char *string, /* UTF-8 encoded string */
1121 Py_ssize_t length, /* size of string */
1122 const char *errors, /* error handling */
1123 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001124 );
1125
Mark Hammond91a681d2002-08-12 07:21:58 +00001126PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001127 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001128 );
1129
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001130#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1132 PyObject *unicode,
1133 const char *errors);
1134
Mark Hammond91a681d2002-08-12 07:21:58 +00001135PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 const Py_UNICODE *data, /* Unicode char buffer */
1137 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1138 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001139 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001140#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001141
Walter Dörwald41980ca2007-08-16 21:55:45 +00001142/* --- UTF-32 Codecs ------------------------------------------------------ */
1143
1144/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1145 the corresponding Unicode object.
1146
1147 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001149
1150 If byteorder is non-NULL, the decoder starts decoding using the
1151 given byte order:
1152
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 *byteorder == -1: little endian
1154 *byteorder == 0: native order
1155 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001156
1157 In native mode, the first four bytes of the stream are checked for a
1158 BOM mark. If found, the BOM mark is analysed, the byte order
1159 adjusted and the BOM skipped. In the other modes, no BOM mark
1160 interpretation is done. After completion, *byteorder is set to the
1161 current byte order at the end of input data.
1162
1163 If byteorder is NULL, the codec starts in native order mode.
1164
1165*/
1166
1167PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 const char *string, /* UTF-32 encoded string */
1169 Py_ssize_t length, /* size of string */
1170 const char *errors, /* error handling */
1171 int *byteorder /* pointer to byteorder to use
1172 0=native;-1=LE,1=BE; updated on
1173 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001174 );
1175
1176PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001177 const char *string, /* UTF-32 encoded string */
1178 Py_ssize_t length, /* size of string */
1179 const char *errors, /* error handling */
1180 int *byteorder, /* pointer to byteorder to use
1181 0=native;-1=LE,1=BE; updated on
1182 exit */
1183 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001184 );
1185
1186/* Returns a Python string using the UTF-32 encoding in native byte
1187 order. The string always starts with a BOM mark. */
1188
1189PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001191 );
1192
1193/* Returns a Python string object holding the UTF-32 encoded value of
1194 the Unicode data.
1195
1196 If byteorder is not 0, output is written according to the following
1197 byte order:
1198
1199 byteorder == -1: little endian
1200 byteorder == 0: native byte order (writes a BOM mark)
1201 byteorder == 1: big endian
1202
1203 If byteorder is 0, the output string will always start with the
1204 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1205 prepended.
1206
1207*/
1208
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001209#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001210PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001211 const Py_UNICODE *data, /* Unicode char buffer */
1212 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1213 const char *errors, /* error handling */
1214 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001215 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001216PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1217 PyObject *object, /* Unicode object */
1218 const char *errors, /* error handling */
1219 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1220 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001221#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001222
Guido van Rossumd8225182000-03-10 22:33:05 +00001223/* --- UTF-16 Codecs ------------------------------------------------------ */
1224
Guido van Rossum9e896b32000-04-05 20:11:21 +00001225/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001226 the corresponding Unicode object.
1227
1228 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001230
1231 If byteorder is non-NULL, the decoder starts decoding using the
1232 given byte order:
1233
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 *byteorder == -1: little endian
1235 *byteorder == 0: native order
1236 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001237
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001238 In native mode, the first two bytes of the stream are checked for a
1239 BOM mark. If found, the BOM mark is analysed, the byte order
1240 adjusted and the BOM skipped. In the other modes, no BOM mark
1241 interpretation is done. After completion, *byteorder is set to the
1242 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001243
1244 If byteorder is NULL, the codec starts in native order mode.
1245
1246*/
1247
Mark Hammond91a681d2002-08-12 07:21:58 +00001248PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 const char *string, /* UTF-16 encoded string */
1250 Py_ssize_t length, /* size of string */
1251 const char *errors, /* error handling */
1252 int *byteorder /* pointer to byteorder to use
1253 0=native;-1=LE,1=BE; updated on
1254 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001255 );
1256
Walter Dörwald69652032004-09-07 20:24:22 +00001257PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 const char *string, /* UTF-16 encoded string */
1259 Py_ssize_t length, /* size of string */
1260 const char *errors, /* error handling */
1261 int *byteorder, /* pointer to byteorder to use
1262 0=native;-1=LE,1=BE; updated on
1263 exit */
1264 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001265 );
1266
Guido van Rossumd8225182000-03-10 22:33:05 +00001267/* Returns a Python string using the UTF-16 encoding in native byte
1268 order. The string always starts with a BOM mark. */
1269
Mark Hammond91a681d2002-08-12 07:21:58 +00001270PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001272 );
1273
1274/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001275 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001276
1277 If byteorder is not 0, output is written according to the following
1278 byte order:
1279
1280 byteorder == -1: little endian
1281 byteorder == 0: native byte order (writes a BOM mark)
1282 byteorder == 1: big endian
1283
1284 If byteorder is 0, the output string will always start with the
1285 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1286 prepended.
1287
1288 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1289 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001290 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001291
1292*/
1293
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001294#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001295PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 const Py_UNICODE *data, /* Unicode char buffer */
1297 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1298 const char *errors, /* error handling */
1299 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001300 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001301PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1302 PyObject* unicode, /* Unicode object */
1303 const char *errors, /* error handling */
1304 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1305 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001306#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001307
1308/* --- Unicode-Escape Codecs ---------------------------------------------- */
1309
Mark Hammond91a681d2002-08-12 07:21:58 +00001310PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 const char *string, /* Unicode-Escape encoded string */
1312 Py_ssize_t length, /* size of string */
1313 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001314 );
1315
Mark Hammond91a681d2002-08-12 07:21:58 +00001316PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001318 );
1319
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001320#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001321PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 const Py_UNICODE *data, /* Unicode char buffer */
1323 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001324 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001325#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001326
1327/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1328
Mark Hammond91a681d2002-08-12 07:21:58 +00001329PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 const char *string, /* Raw-Unicode-Escape encoded string */
1331 Py_ssize_t length, /* size of string */
1332 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001333 );
1334
Mark Hammond91a681d2002-08-12 07:21:58 +00001335PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001337 );
1338
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001339#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001340PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341 const Py_UNICODE *data, /* Unicode char buffer */
1342 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001343 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001344#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001345
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001346/* --- Unicode Internal Codec ---------------------------------------------
1347
1348 Only for internal use in _codecsmodule.c */
1349
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001350#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001351PyObject *_PyUnicode_DecodeUnicodeInternal(
1352 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001353 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001354 const char *errors
1355 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001356#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001357
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001358/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001359
1360 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1361
1362*/
1363
Mark Hammond91a681d2002-08-12 07:21:58 +00001364PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 const char *string, /* Latin-1 encoded string */
1366 Py_ssize_t length, /* size of string */
1367 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001368 );
1369
Mark Hammond91a681d2002-08-12 07:21:58 +00001370PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001372 );
1373
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001374#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1376 PyObject* unicode,
1377 const char* errors);
1378
Mark Hammond91a681d2002-08-12 07:21:58 +00001379PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 const Py_UNICODE *data, /* Unicode char buffer */
1381 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1382 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001383 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001384#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001385
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001387
1388 Only 7-bit ASCII data is accepted. All other codes generate errors.
1389
1390*/
1391
Mark Hammond91a681d2002-08-12 07:21:58 +00001392PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 const char *string, /* ASCII encoded string */
1394 Py_ssize_t length, /* size of string */
1395 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001396 );
1397
Mark Hammond91a681d2002-08-12 07:21:58 +00001398PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001399 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001400 );
1401
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001402#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1404 PyObject* unicode,
1405 const char* errors);
1406
Mark Hammond91a681d2002-08-12 07:21:58 +00001407PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 const Py_UNICODE *data, /* Unicode char buffer */
1409 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1410 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001411 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001412#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001413
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001415
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001417
1418 Decoding mappings must map single string characters to single
1419 Unicode characters, integers (which are then interpreted as Unicode
1420 ordinals) or None (meaning "undefined mapping" and causing an
1421 error).
1422
1423 Encoding mappings must map single Unicode characters to single
1424 string characters, integers (which are then interpreted as Latin-1
1425 ordinals) or None (meaning "undefined mapping" and causing an
1426 error).
1427
1428 If a character lookup fails with a LookupError, the character is
1429 copied as-is meaning that its ordinal value will be interpreted as
1430 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1431 to contain those mappings which map characters to different code
1432 points.
1433
1434*/
1435
Mark Hammond91a681d2002-08-12 07:21:58 +00001436PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 const char *string, /* Encoded string */
1438 Py_ssize_t length, /* size of string */
1439 PyObject *mapping, /* character mapping
1440 (char ordinal -> unicode ordinal) */
1441 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001442 );
1443
Mark Hammond91a681d2002-08-12 07:21:58 +00001444PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001445 PyObject *unicode, /* Unicode object */
1446 PyObject *mapping /* character mapping
1447 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001448 );
1449
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001450#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001451PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 const Py_UNICODE *data, /* Unicode char buffer */
1453 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1454 PyObject *mapping, /* character mapping
1455 (unicode ordinal -> char ordinal) */
1456 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001457 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001458PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1459 PyObject *unicode, /* Unicode object */
1460 PyObject *mapping, /* character mapping
1461 (unicode ordinal -> char ordinal) */
1462 const char *errors /* error handling */
1463 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001464#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001465
1466/* Translate a Py_UNICODE buffer of the given length by applying a
1467 character mapping table to it and return the resulting Unicode
1468 object.
1469
1470 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001472
1473 Mapping tables may be dictionaries or sequences. Unmapped character
1474 ordinals (ones which cause a LookupError) are left untouched and
1475 are copied as-is.
1476
1477*/
1478
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001479#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001480PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481 const Py_UNICODE *data, /* Unicode char buffer */
1482 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1483 PyObject *table, /* Translate table */
1484 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001485 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001486#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001487
Victor Stinner99b95382011-07-04 14:23:54 +02001488#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001489
Guido van Rossumefec1152000-03-28 02:01:15 +00001490/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001491
Mark Hammond91a681d2002-08-12 07:21:58 +00001492PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001493 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001494 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001495 const char *errors /* error handling */
1496 );
1497
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001498PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1499 const char *string, /* MBCS encoded string */
1500 Py_ssize_t length, /* size of string */
1501 const char *errors, /* error handling */
1502 Py_ssize_t *consumed /* bytes consumed */
1503 );
1504
Victor Stinner3a50e702011-10-18 21:21:00 +02001505PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1506 int code_page, /* code page number */
1507 const char *string, /* encoded string */
1508 Py_ssize_t length, /* size of string */
1509 const char *errors, /* error handling */
1510 Py_ssize_t *consumed /* bytes consumed */
1511 );
1512
Mark Hammond91a681d2002-08-12 07:21:58 +00001513PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001514 PyObject *unicode /* Unicode object */
1515 );
1516
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001517#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001518PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001519 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001520 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001521 const char *errors /* error handling */
1522 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001523#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001524
Victor Stinner3a50e702011-10-18 21:21:00 +02001525PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1526 int code_page, /* code page number */
1527 PyObject *unicode, /* Unicode object */
1528 const char *errors /* error handling */
1529 );
1530
Victor Stinner99b95382011-07-04 14:23:54 +02001531#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001532
Guido van Rossum9e896b32000-04-05 20:11:21 +00001533/* --- Decimal Encoder ---------------------------------------------------- */
1534
1535/* Takes a Unicode string holding a decimal value and writes it into
1536 an output buffer using standard ASCII digit codes.
1537
1538 The output buffer has to provide at least length+1 bytes of storage
1539 area. The output string is 0-terminated.
1540
1541 The encoder converts whitespace to ' ', decimal characters to their
1542 corresponding ASCII digit and all other Latin-1 characters except
1543 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1544 are treated as errors. This includes embedded NULL bytes.
1545
1546 Error handling is defined by the errors argument:
1547
1548 NULL or "strict": raise a ValueError
1549 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001550 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001551 "replace": replaces illegal characters with '?'
1552
1553 Returns 0 on success, -1 on failure.
1554
1555*/
1556
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001557#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001558PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 Py_UNICODE *s, /* Unicode buffer */
1560 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1561 char *output, /* Output buffer; must have size >= length+1
 (room for the terminating 0 byte) */
1562 const char *errors /* error handling: NULL/"strict", "ignore"
 or "replace" */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001563 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001564#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001565
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001566/* Transforms code points that have decimal digit property to the
1567 corresponding ASCII digit code points.
1568
1569 Returns a new Unicode string on success, NULL on failure.
1570*/
1571
Georg Brandlb5503082010-12-05 11:40:48 +00001572#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001573PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1574 Py_UNICODE *s, /* Unicode buffer */
1575 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1576 );
Georg Brandlb5503082010-12-05 11:40:48 +00001577#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001578
Victor Stinner6f9568b2011-11-17 00:12:44 +01001579/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 as argument instead of a raw buffer and length. This function additionally
1581 transforms spaces to ASCII because this is what the callers in longobject,
1582 floatobject, and complexobject did anyways. */
1583
1584#ifndef Py_LIMITED_API
1585PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1586 PyObject *unicode /* Unicode object */
1587 );
1588#endif
1589
Martin v. Löwis011e8422009-05-05 04:43:17 +00001590/* --- File system encoding ---------------------------------------------- */
1591
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001592/* ParseTuple converter: encode str objects to bytes using
1593 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001594
1595PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1596
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001597/* ParseTuple converter: decode bytes objects to unicode using
1598 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1599
1600PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1601
Victor Stinner77c38622010-05-14 15:58:55 +00001602/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1603 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001604
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001605 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1606 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001607
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001608 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001609*/
1610
1611PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1612 const char *s /* encoded string */
1613 );
1614
Victor Stinner77c38622010-05-14 15:58:55 +00001615/* Decode a string using Py_FileSystemDefaultEncoding
1616 and the "surrogateescape" error handler.
1617
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001618 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1619 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001620*/
1621
Martin v. Löwis011e8422009-05-05 04:43:17 +00001622PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1623 const char *s, /* encoded string */
1624 Py_ssize_t size /* size */
1625 );
1626
Victor Stinnerae6265f2010-05-15 16:27:27 +00001627/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001628 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001629
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001630 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1631 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001632*/
1633
1634PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1635 PyObject *unicode
1636 );
1637
Guido van Rossumd8225182000-03-10 22:33:05 +00001638/* --- Methods & Slots ----------------------------------------------------
1639
1640 These are capable of handling Unicode objects and strings on input
1641 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001642 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001643
1644/* Concat two strings giving a new Unicode string. */
1645
Mark Hammond91a681d2002-08-12 07:21:58 +00001646PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 PyObject *left, /* Left string */
1648 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001649 );
1650
Walter Dörwald1ab83302007-05-18 17:15:44 +00001651/* Concat two strings and put the result in *pleft
1652 (sets *pleft to NULL on error) */
1653
1654PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 PyObject **pleft, /* Pointer to left string */
1656 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001657 );
1658
1659/* Concat two strings, put the result in *pleft and drop the right object
1660 (sets *pleft to NULL on error) */
1661
1662PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001663 PyObject **pleft, /* Pointer to left string */
1664 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001665 );
1666
Guido van Rossumd8225182000-03-10 22:33:05 +00001667/* Split a string giving a list of Unicode strings.
1668
1669 If sep is NULL, splitting will be done at all whitespace
1670 substrings. Otherwise, splits occur at the given separator.
1671
1672 At most maxsplit splits will be done. If negative, no limit is set.
1673
1674 Separators are not included in the resulting list.
1675
1676*/
1677
Mark Hammond91a681d2002-08-12 07:21:58 +00001678PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001679 PyObject *s, /* String to split */
1680 PyObject *sep, /* String separator */
1681 Py_ssize_t maxsplit /* Maxsplit count */
1682 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001683
1684/* Ditto, but split at line breaks.
1685
1686 CRLF is considered to be one line break. Line breaks are not
1687 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688
Mark Hammond91a681d2002-08-12 07:21:58 +00001689PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001690 PyObject *s, /* String to split */
1691 int keepends /* If true, line end markers are included */
1692 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001693
Thomas Wouters477c8d52006-05-27 19:21:47 +00001694/* Partition a string using a given separator. */
1695
1696PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 PyObject *s, /* String to partition */
1698 PyObject *sep /* String separator */
1699 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001700
1701/* Partition a string using a given separator, searching from the end of the
1702 string. */
1703
1704PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 PyObject *s, /* String to partition */
1706 PyObject *sep /* String separator */
1707 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001708
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001709/* Split a string giving a list of Unicode strings.
1710
1711 If sep is NULL, splitting will be done at all whitespace
1712 substrings. Otherwise, splits occur at the given separator.
1713
1714 At most maxsplit splits will be done; if negative, no limit is
1715 set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
1716 of the string.
1717
1718 Separators are not included in the resulting list.
1719
1720*/
1721
1722PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001723 PyObject *s, /* String to split */
1724 PyObject *sep, /* String separator */
1725 Py_ssize_t maxsplit /* Maxsplit count */
1726 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001727
Guido van Rossumd8225182000-03-10 22:33:05 +00001728/* Translate a string by applying a character mapping table to it and
1729 return the resulting Unicode object.
1730
1731 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001732 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001733
1734 Mapping tables may be dictionaries or sequences. Unmapped character
1735 ordinals (ones which cause a LookupError) are left untouched and
1736 are copied as-is.
1737
1738*/
1739
Mark Hammond91a681d2002-08-12 07:21:58 +00001740PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001741 PyObject *str, /* String */
1742 PyObject *table, /* Translate table */
1743 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001744 );
1745
1746/* Join a sequence of strings using the given separator and return
1747 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001748
Mark Hammond91a681d2002-08-12 07:21:58 +00001749PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001750 PyObject *separator, /* Separator string */
1751 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001752 );
1753
1754/* Return 1 if substr matches str[start:end] at the given tail end, 0
1755 otherwise. */
1756
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 PyObject *str, /* String */
1759 PyObject *substr, /* Prefix or Suffix string */
1760 Py_ssize_t start, /* Start index */
1761 Py_ssize_t end, /* Stop index */
1762 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001763 );
1764
1765/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001766 given search direction or -1 if not found. -2 is returned in case
1767 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001768
Martin v. Löwis18e16552006-02-15 17:27:45 +00001769PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001770 PyObject *str, /* String */
1771 PyObject *substr, /* Substring to find */
1772 Py_ssize_t start, /* Start index */
1773 Py_ssize_t end, /* Stop index */
1774 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001775 );
1776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777/* Like PyUnicode_Find, but search for single character only. */
1778PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1779 PyObject *str, /* String */
1780 Py_UCS4 ch, /* Character to find */
1781 Py_ssize_t start, /* Start index */
1782 Py_ssize_t end, /* Stop index */
1783 int direction /* Find direction, as for PyUnicode_Find */
1784 );
1785
Barry Warsaw51ac5802000-03-20 16:36:48 +00001786/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001787
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001789 PyObject *str, /* String */
1790 PyObject *substr, /* Substring to count */
1791 Py_ssize_t start, /* Start index */
1792 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001793 );
1794
Barry Warsaw51ac5802000-03-20 16:36:48 +00001795/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001796 and return the resulting Unicode object. */
1797
Mark Hammond91a681d2002-08-12 07:21:58 +00001798PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001799 PyObject *str, /* String */
1800 PyObject *substr, /* Substring to find */
1801 PyObject *replstr, /* Substring to replace */
1802 Py_ssize_t maxcount /* Max. number of replacements to apply;
1803 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001804 );
1805
1806/* Compare two strings and return -1, 0, 1 for less than, equal,
1807 greater than, respectively. -1 is also returned on failure; check PyErr_Occurred(). */
1808
Mark Hammond91a681d2002-08-12 07:21:58 +00001809PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001810 PyObject *left, /* Left string */
1811 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001812 );
1813
Martin v. Löwis5b222132007-06-10 09:51:05 +00001814PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1815 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001816 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001817 );
1818
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001819/* Rich compare two strings and return one of the following:
1820
1821 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001822 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001823 - Py_NotImplemented in case the type combination is unknown
1824
1825 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1826 case the conversion of the arguments to Unicode fails with a
1827 UnicodeDecodeError.
1828
1829 Possible values for op:
1830
1831 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1832
1833*/
1834
1835PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001836 PyObject *left, /* Left string */
1837 PyObject *right, /* Right string */
1838 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001839 );
1840
Thomas Wouters7e474022000-07-16 12:04:32 +00001841/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001842 the resulting Unicode string. */
1843
Mark Hammond91a681d2002-08-12 07:21:58 +00001844PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001845 PyObject *format, /* Format string */
1846 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001847 );
1848
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001849/* Checks whether element is contained in container and return 1/0
1850 accordingly.
1851
1852 element has to coerce to a one-element Unicode string. -1 is
1853 returned in case of an error. */
1854
Mark Hammond91a681d2002-08-12 07:21:58 +00001855PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001856 PyObject *container, /* Container string */
1857 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001858 );
1859
Martin v. Löwis47383402007-08-15 07:32:56 +00001860/* Checks whether argument is a valid identifier. */
1861
1862PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1863
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001864#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001865/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001866PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001867 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001868 int striptype,
1869 PyObject *sepobj
1870 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001871#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001872
Eric Smith5807c412008-05-11 21:00:57 +00001873/* Using the current locale, insert the thousands grouping
1874 into the string pointed to by buffer. For the argument descriptions,
1875 see Objects/stringlib/localeutil.h */
1876
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001877#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001878PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1879 Py_ssize_t n_buffer,
1880 Py_UNICODE *digits,
1881 Py_ssize_t n_digits,
1882 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001883#endif
Eric Smith5807c412008-05-11 21:00:57 +00001884
Eric Smitha3b1ac82009-04-03 14:45:06 +00001885/* Using explicit passed-in values, insert the thousands grouping
1886 into the string pointed to by buffer. For the argument descriptions,
1887 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001888#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001890 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 int kind,
1892 void *buffer,
1893 Py_ssize_t n_buffer,
1894 void *digits,
1895 Py_ssize_t n_digits,
1896 Py_ssize_t min_width,
1897 const char *grouping,
1898 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001899#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001900/* === Characters Type APIs =============================================== */
1901
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001902/* Helper array used by Py_UNICODE_ISSPACE(). */
1903
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001904#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001905PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1906
Guido van Rossumd8225182000-03-10 22:33:05 +00001907/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001908 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001909
1910 These APIs are implemented in Objects/unicodectype.c.
1911
1912*/
1913
Mark Hammond91a681d2002-08-12 07:21:58 +00001914PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001915 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001916 );
1917
Mark Hammond91a681d2002-08-12 07:21:58 +00001918PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001919 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001920 );
1921
Mark Hammond91a681d2002-08-12 07:21:58 +00001922PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001923 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001924 );
1925
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001926PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001927 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001928 );
1929
1930PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001931 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001932 );
1933
/* Internal helper; as noted for this group, use the Py_UNICODE_IS* macros
   rather than calling it directly. The parameter is passed by value, so it
   is declared without a top-level const, matching the sibling prototypes. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001934PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001935 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
/* Internal helper; as noted for this group, use the Py_UNICODE_IS* macros
   rather than calling it directly. The parameter is passed by value, so it
   is declared without a top-level const, matching the sibling prototypes. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001938PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001939 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001942PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1943 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001944 );
1945
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001946PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1947 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001948 );
1949
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001950PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1951 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001952 );
1953
Mark Hammond91a681d2002-08-12 07:21:58 +00001954PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001955 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001956 );
1957
Mark Hammond91a681d2002-08-12 07:21:58 +00001958PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001959 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001960 );
1961
Mark Hammond91a681d2002-08-12 07:21:58 +00001962PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001963 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001964 );
1965
Mark Hammond91a681d2002-08-12 07:21:58 +00001966PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001967 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001968 );
1969
Mark Hammond91a681d2002-08-12 07:21:58 +00001970PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001971 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001972 );
1973
Mark Hammond91a681d2002-08-12 07:21:58 +00001974PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001975 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001976 );
1977
Georg Brandl559e5d72008-06-11 18:37:52 +00001978PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001979 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001980 );
1981
Mark Hammond91a681d2002-08-12 07:21:58 +00001982PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001983 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001984 );
1985
Victor Stinneref8d95c2010-08-16 22:03:11 +00001986PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1987 const Py_UNICODE *u
1988 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001989
1990PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001991 Py_UNICODE *s1,
1992 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001993
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001994PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1995 Py_UNICODE *s1, const Py_UNICODE *s2);
1996
Martin v. Löwis5b222132007-06-10 09:51:05 +00001997PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001998 Py_UNICODE *s1,
1999 const Py_UNICODE *s2,
2000 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002001
2002PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002003 const Py_UNICODE *s1,
2004 const Py_UNICODE *s2
2005 );
2006
2007PyAPI_FUNC(int) Py_UNICODE_strncmp(
2008 const Py_UNICODE *s1,
2009 const Py_UNICODE *s2,
2010 size_t n
2011 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002012
2013PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002014 const Py_UNICODE *s,
2015 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002016 );
2017
Victor Stinner331ea922010-08-10 16:37:20 +00002018PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002019 const Py_UNICODE *s,
2020 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002021 );
2022
Victor Stinner71133ff2010-09-01 23:43:53 +00002023/* Create a copy of a unicode string ending with a nul character. Return NULL
2024 and raise a MemoryError exception on memory allocation failure, otherwise
2025 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2026
Victor Stinner46408602010-09-03 16:18:00 +00002027PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002028 PyObject *unicode
2029 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002030#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002031
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002032#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002033PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002034 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002035 int check_content);
2036#endif
2037
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002038/********************* String Literals ****************************************/
2039/* This structure helps managing static strings. The basic usage goes like this:
2040 Instead of doing
2041
2042 r = PyObject_CallMethod(o, "foo", "args", ...);
2043
2044 do
2045
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002046 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002047 ...
2048 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2049
2050 PyId_foo is a static variable, either on block level or file level. On first
2051 usage, the string "foo" is interned, and the structures are linked. On interpreter
2052 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2053
2054 Alternatively, _Py_static_string allows choosing the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002055 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002056 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2057*/
2058typedef struct _Py_Identifier {
2059 struct _Py_Identifier *next; /* Link to another identifier; starts as 0, the structures are linked on first use */
2060 const char* string; /* C string given as `value` to _Py_static_string */
2061 PyObject *object; /* Starts as 0; presumably holds the interned Unicode object once created -- TODO confirm in the implementation */
2062} _Py_Identifier;
2063
/* Define a static _Py_Identifier named `varname` whose string member is
   `value`; the next and object members are initialized to 0. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002064#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
/* Define a static identifier named PyId_<varname> whose string is the
   stringified `varname`. */
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002065#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002066
2067/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2068PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2069/* Clear all static strings. */
2070PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2071
Guido van Rossumd8225182000-03-10 22:33:05 +00002072#ifdef __cplusplus
2073}
2074#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002075#endif /* !Py_UNICODEOBJECT_H */