blob: 71404f3ae798647d7c2706a3c8edf62699db4ce7 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000145#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
152
153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
156
157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000161
162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
165
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000167
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168#define Py_UNICODE_ISALNUM(ch) \
169 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_UNICODE_ISDECIMAL(ch) || \
171 Py_UNICODE_ISDIGIT(ch) || \
172 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200174#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Fill the first `length` Py_UNICODE slots of `target` with `value`.

   target -- pointer to the first Py_UNICODE element to overwrite
   value  -- code unit stored into every slot (converted to Py_UNICODE)
   length -- number of slots to write; zero or negative writes nothing

   Every argument is evaluated exactly once, in the order target, value,
   length, so arguments with side effects are safe.  The temporaries use
   trailing-underscore names (i_, t_, v_, n_); callers must not pass
   expressions that refer to identifiers with those names. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* Macros to work with surrogates.

   Py_UNICODE_IS_SURROGATE(ch)      -- true if ch is in 0xD800..0xDFFF
   Py_UNICODE_IS_HIGH_SURROGATE(ch) -- true if ch is in 0xD800..0xDBFF
   Py_UNICODE_IS_LOW_SURROGATE(ch)  -- true if ch is in 0xDC00..0xDFFF

   The argument is parenthesized in the expansion so that compound
   expressions such as `a | b` or `c ? x : y` are tested as a whole.
   Note that `ch` is evaluated twice, so it must not have side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
186/* Join two surrogate characters and return a single Py_UCS4 value. */
187#define Py_UNICODE_JOIN_SURROGATES(high, low) \
188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \
189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
190
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000191/* Check if substring matches at given offset. The offset must be
192 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000193
Thomas Wouters477c8d52006-05-27 19:21:47 +0000194#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200195 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
196 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
197 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
198
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000199#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000200
Barry Warsaw51ac5802000-03-20 16:36:48 +0000201#ifdef __cplusplus
202extern "C" {
203#endif
204
Guido van Rossumd8225182000-03-10 22:33:05 +0000205/* --- Unicode Type ------------------------------------------------------- */
206
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000207#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200208
209/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
210 structure. state.ascii and state.compact are set, and the data
211 immediately follow the structure. utf8_length and wstr_length can be found
212 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000213typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200214 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200215
216 - compact ascii:
217
218 * structure = PyASCIIObject
219 * kind = PyUnicode_1BYTE_KIND
220 * compact = 1
221 * ascii = 1
222 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200223 * (length is the length of the utf8 and wstr strings)
224 * (data starts just after the structure)
225 * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200226
227 - compact:
228
229 * structure = PyCompactUnicodeObject
230 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
231 PyUnicode_4BYTE_KIND
232 * compact = 1
233 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200234 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200235 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200236 * utf8_length = 0 if utf8 is NULL
237 * wstr is shared with data and wstr_length=length
238 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100239 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200240 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200242
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200243 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200244
245 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100246 * length = 0 (use wstr_length)
247 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200248 * kind = PyUnicode_WCHAR_KIND
249 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200250 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200251 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100252 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200253 * wstr is not NULL
254 * data.any is NULL
255 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200256 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200257
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200258 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200259
260 * structure = PyUnicodeObject structure
261 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
262 PyUnicode_4BYTE_KIND
263 * compact = 0
264 * ready = 1
265 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200266 * utf8 is shared and utf8_length = length with data.any if ascii = 1
267 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100268 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200269 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
270 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
271 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200272
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200273 Compact strings use only one memory block (structure + characters),
274 whereas legacy strings use one block for the structure and one block
275 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200276
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200277 Legacy strings are created by PyUnicode_FromUnicode() and
278 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
279 when PyUnicode_READY() is called.
280
281 See also _PyUnicode_CheckConsistency().
282 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000283 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200284 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000285 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200286 struct {
287 /*
288 SSTATE_NOT_INTERNED (0)
289 SSTATE_INTERNED_MORTAL (1)
290 SSTATE_INTERNED_IMMORTAL (2)
291
292 If interned != SSTATE_NOT_INTERNED, the two references from the
293 dictionary to this object are *not* counted in ob_refcnt.
294 */
295 unsigned int interned:2;
296 /* Character size:
297
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200298 - PyUnicode_WCHAR_KIND (0):
299
300 * character type = wchar_t (16 or 32 bits, depending on the
301 platform)
302
303 - PyUnicode_1BYTE_KIND (1):
304
305 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200306 * if ascii is set, all characters must be in range
307 U+0000-U+007F, otherwise at least one character must be in range
308 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200309
310 - PyUnicode_2BYTE_KIND (2):
311
312 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200313 * at least one character must be in range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200314
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200315 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200316
317 * character type = Py_UCS4 (32 bits, unsigned)
318 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200319 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200320 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 /* Compact is with respect to the allocation scheme. Compact unicode
322 objects only require one memory block while non-compact objects use
323 one block for the PyUnicodeObject struct and another for its data
324 buffer. */
325 unsigned int compact:1;
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200326 /* The string only contains characters in range U+0000-U+007F (ASCII)
327 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
328 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 unsigned int ascii:1;
330 /* The ready flag indicates whether the object layout is initialized
331 completely. This means that this is either a compact object, or
332 the data pointer is filled out. The bit is redundant, and helps
333 to minimize the test in PyUnicode_IS_READY(). */
334 unsigned int ready:1;
335 } state;
336 wchar_t *wstr; /* wchar_t representation (null-terminated) */
337} PyASCIIObject;
338
339/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200340 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200341 immediately follow the structure. */
342typedef struct {
343 PyASCIIObject _base;
344 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
345 * terminating \0. */
346 char *utf8; /* UTF-8 representation (null-terminated) */
347 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
348 * surrogates count as two code points. */
349} PyCompactUnicodeObject;
350
351/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
352 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200353 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354typedef struct {
355 PyCompactUnicodeObject _base;
356 union {
357 void *any;
358 Py_UCS1 *latin1;
359 Py_UCS2 *ucs2;
360 Py_UCS4 *ucs4;
361 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000362} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000363#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000364
Mark Hammond91a681d2002-08-12 07:21:58 +0000365PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000366PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000367
Thomas Wouters27d517b2007-02-25 20:39:11 +0000368#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000369 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
370#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000371
372/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000373#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200374
/* Return the length of the wstr (wchar_t) representation of op.
   For compact ASCII strings this is read from PyASCIIObject.length;
   for every other string it is read from
   PyCompactUnicodeObject.wstr_length.
   The argument is parenthesized before each cast so that compound
   expressions are cast as a whole.  `op` may be evaluated more than once.
   No type checks or Ready calls are performed. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
379
380/* Returns the deprecated Py_UNICODE representation's size in code units
381 (this includes surrogate pairs as 2 units).
382 If the Py_UNICODE representation is not available, it will be computed
383 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
384
Guido van Rossumd8225182000-03-10 22:33:05 +0000385#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 (assert(PyUnicode_Check(op)), \
387 (((PyASCIIObject *)(op))->wstr) ? \
388 PyUnicode_WSTR_LENGTH(op) : \
389 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
390 PyUnicode_WSTR_LENGTH(op)))
391
Guido van Rossumd8225182000-03-10 22:33:05 +0000392#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200393 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
394
395/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
396 representation on demand. Using this macro is very inefficient now,
397 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
398 use PyUnicode_WRITE() and PyUnicode_READ(). */
399
Guido van Rossumd8225182000-03-10 22:33:05 +0000400#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401 (assert(PyUnicode_Check(op)), \
402 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
403 PyUnicode_AsUnicode((PyObject *)(op)))
404
Guido van Rossumd8225182000-03-10 22:33:05 +0000405#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200406 ((const char *)(PyUnicode_AS_UNICODE(op)))
407
408
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200409/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410
Victor Stinner6f9568b2011-11-17 00:12:44 +0100411/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200412
413/* Interning state. */
414#define SSTATE_NOT_INTERNED 0
415#define SSTATE_INTERNED_MORTAL 1
416#define SSTATE_INTERNED_IMMORTAL 2
417
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed. The argument is parenthesized before the
   cast so that compound expressions are cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)
423
424/* Return true if the string is compact or 0 if not.
425 No type checks or Ready calls are performed. */
426#define PyUnicode_IS_COMPACT(op) \
427 (((PyASCIIObject*)(op))->state.compact)
428
429/* Return true if the string is a compact ASCII string (use PyASCIIObject
430 structure), or 0 if not. No type checks or Ready calls are performed. */
431#define PyUnicode_IS_COMPACT_ASCII(op) \
432 (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200434enum PyUnicode_Kind {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200436 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200437 has not been called yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200438 PyUnicode_WCHAR_KIND = 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200439/* Return values of the PyUnicode_KIND() macro: */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200440 PyUnicode_1BYTE_KIND = 1,
441 PyUnicode_2BYTE_KIND = 2,
442 PyUnicode_4BYTE_KIND = 4
443};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200444
Georg Brandl4975a9b2011-10-05 16:12:21 +0200445/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200446 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200447 No checks are performed, use PyUnicode_KIND() before to ensure
448 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200449
450#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
451#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
452#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
453
Victor Stinner157f83f2011-09-28 21:41:31 +0200454/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200455#define PyUnicode_KIND(op) \
456 (assert(PyUnicode_Check(op)), \
457 assert(PyUnicode_IS_READY(op)), \
458 ((PyASCIIObject *)(op))->state.kind)
459
Victor Stinner157f83f2011-09-28 21:41:31 +0200460/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200462 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463 ((void*)((PyASCIIObject*)(op) + 1)) : \
464 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
465
466#define _PyUnicode_NONCOMPACT_DATA(op) \
467 (assert(((PyUnicodeObject*)(op))->data.any), \
468 ((((PyUnicodeObject *)(op))->data.any)))
469
470#define PyUnicode_DATA(op) \
471 (assert(PyUnicode_Check(op)), \
472 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
473 _PyUnicode_NONCOMPACT_DATA(op))
474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475/* In the access macros below, "kind" may be evaluated more than once.
476 All other macro parameters are evaluated exactly once, so it is safe
477 to put side effects into them (such as increasing the index). */
478
479/* Write into the canonical representation, this macro does not do any sanity
480 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200481 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200482 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200483 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200484#define PyUnicode_WRITE(kind, data, index, value) \
485 do { \
486 switch ((kind)) { \
487 case PyUnicode_1BYTE_KIND: { \
488 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
489 break; \
490 } \
491 case PyUnicode_2BYTE_KIND: { \
492 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
493 break; \
494 } \
495 default: { \
496 assert((kind) == PyUnicode_4BYTE_KIND); \
497 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
498 } \
499 } \
500 } while (0)
501
Georg Brandl07de3252011-10-05 16:47:38 +0200502/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200503 or ready calls are performed. */
504#define PyUnicode_READ(kind, data, index) \
505 ((Py_UCS4) \
506 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200507 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200508 ((kind) == PyUnicode_2BYTE_KIND ? \
509 ((const Py_UCS2 *)(data))[(index)] : \
510 ((const Py_UCS4 *)(data))[(index)] \
511 ) \
512 ))
513
514/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
515 calls PyUnicode_KIND() and might call it twice. For single reads, use
516 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
517 cache kind and use PyUnicode_READ instead. */
518#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200519 (assert(PyUnicode_Check(unicode)), \
520 assert(PyUnicode_IS_READY(unicode)), \
521 (Py_UCS4) \
522 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
523 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
524 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
525 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
526 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
527 ) \
528 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200529
530/* Returns the length of the unicode string. The caller has to make sure that
531 the string has its canonical representation set before calling
532 this macro. Call PyUnicode_READY() to ensure that. */
533#define PyUnicode_GET_LENGTH(op) \
534 (assert(PyUnicode_Check(op)), \
535 assert(PyUnicode_IS_READY(op)), \
536 ((PyASCIIObject *)(op))->length)
537
538
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   The argument is parenthesized before the cast so that compound
   expressions are cast as a whole. No type checks are performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
543
Victor Stinnera3b334d2011-10-03 13:53:37 +0200544/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200545 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200546 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 Returns 0 on success and -1 on errors. */
548#define PyUnicode_READY(op) \
549 (assert(PyUnicode_Check(op)), \
550 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200551 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553/* Return a maximum character value which is suitable for creating another
554 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200555 than iterating over the string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556#define PyUnicode_MAX_CHAR_VALUE(op) \
557 (assert(PyUnicode_IS_READY(op)), \
Victor Stinner88131042011-10-13 01:12:01 +0200558 (PyUnicode_IS_ASCII(op) ? \
559 (0x7f) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200561 (0xffU) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200562 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200563 (0xffffU) : \
564 (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000566#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000567
568/* --- Constants ---------------------------------------------------------- */
569
570/* This Unicode character will be used as replacement character during
571 decoding if the errors argument is set to "replace". Note: the
572 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
573 Unicode 3.0. */
574
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200575#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000576
577/* === Public API ========================================================= */
578
579/* --- Plain Py_UNICODE --------------------------------------------------- */
580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200581/* With PEP 393, this is the recommended way to allocate a new unicode object.
582 This function will allocate the object and its buffer in a single memory
583 block. Objects created using this function are not resizable. */
584#ifndef Py_LIMITED_API
585PyAPI_FUNC(PyObject*) PyUnicode_New(
586 Py_ssize_t size, /* Number of code points in the new string */
587 Py_UCS4 maxchar /* maximum code point value in the string */
588 );
589#endif
590
Victor Stinnerd8f65102011-09-29 19:43:17 +0200591/* Initializes the canonical string representation from the deprecated
592 wstr/Py_UNICODE representation. This function is used to convert Unicode
593 objects which were created using the old API to the new flexible format
594 introduced with PEP 393.
595
596 Don't call this function directly, use the public PyUnicode_READY() macro
597 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598#ifndef Py_LIMITED_API
599PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200600 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601 );
602#endif
603
Victor Stinner034f6cf2011-09-30 02:26:44 +0200604/* Get a copy of a Unicode string. */
605PyAPI_FUNC(PyObject*) PyUnicode_Copy(
606 PyObject *unicode
607 );
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200610 character conversion when necessary and falls back to memcpy if possible.
611
Victor Stinnera0702ab2011-09-29 14:14:38 +0200612 Fail if to is too small (smaller than how_many or smaller than
613 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
614 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200615
616 Return the number of written characters, or return -1 and raise an exception
617 on error.
618
619 Pseudo-code:
620
621 how_many = min(how_many, len(from) - from_start)
622 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
623 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200624
625 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200626 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200627#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200628PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629 PyObject *to,
630 Py_ssize_t to_start,
631 PyObject *from,
632 Py_ssize_t from_start,
633 Py_ssize_t how_many
634 );
635#endif
636
Guido van Rossumd8225182000-03-10 22:33:05 +0000637/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000638 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000639
640 u may be NULL which causes the contents to be undefined. It is the
641 user's responsibility to fill in the needed data afterwards. Note
642 that modifying the Unicode object contents after construction is
643 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000644
645 The buffer is copied into the new object. */
646
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000647#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000648PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000649 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000650 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000651 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000652#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000653
Georg Brandl952867a2010-06-27 10:17:12 +0000654/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000655PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000656 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000657 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000658 );
659
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000660/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000662PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000663 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000664 );
665
Victor Stinnerb9275c12011-10-05 14:01:42 +0200666/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
667 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200668#ifndef Py_LIMITED_API
669PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
670 int kind,
671 const void *buffer,
672 Py_ssize_t size);
673#endif
674
/* Extract the part of the Unicode object str selected by start and end.
   NOTE(review): presumably start and end are character indices delimiting
   the half-open range [start, end), and NULL is returned with an exception
   set on error -- confirm against the implementation. */
675PyAPI_FUNC(PyObject*) PyUnicode_Substring(
676 PyObject *str,
677 Py_ssize_t start,
678 Py_ssize_t end);
679
Georg Brandldb6c7f52011-10-07 11:19:11 +0200680/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200681 is set. Return NULL and raise an exception on error. Raise a ValueError if
682 the buffer is smaller than the string. Return buffer on success.
683
684 buflen is the length of the buffer in (Py_UCS4) characters. */
685PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
686 PyObject *unicode,
687 Py_UCS4* buffer,
688 Py_ssize_t buflen,
689 int copy_null);
690
691/* Copy the string into a UCS4 buffer. A new buffer is allocated using
692 PyMem_Malloc; if this fails, NULL is returned with a memory error
693 exception set. */
694PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
695
Guido van Rossumd8225182000-03-10 22:33:05 +0000696/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 Py_UNICODE buffer.
698 If the wchar_t/Py_UNICODE representation is not yet available, this
699 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000700
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000701#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000702PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000703 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000704 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000705#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707/* Return a read-only pointer to the Unicode object's internal
708 Py_UNICODE buffer and save the length at size.
709 If the wchar_t/Py_UNICODE representation is not yet available, this
710 function will calculate it. */
711
712#ifndef Py_LIMITED_API
713PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
714 PyObject *unicode, /* Unicode object */
715 Py_ssize_t *size /* location where to save the length */
716 );
717#endif
718
Guido van Rossumd8225182000-03-10 22:33:05 +0000719/* Get the length of the Unicode object. */
720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
722 PyObject *unicode
723);
724
Victor Stinner157f83f2011-09-28 21:41:31 +0200725/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726 string representation. */
727
Martin v. Löwis18e16552006-02-15 17:27:45 +0000728PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000730 );
731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732/* Read a character from the string. */
733
734PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
735 PyObject *unicode,
736 Py_ssize_t index
737 );
738
739/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200740 PyUnicode_New, must not be shared, and must not have been hashed yet.
741
742 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200743
744PyAPI_FUNC(int) PyUnicode_WriteChar(
745 PyObject *unicode,
746 Py_ssize_t index,
747 Py_UCS4 character
748 );
749
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000750#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000751/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000752PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000753#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000754
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200755/* Resize a Unicode object allocated by the legacy API (e.g.
756 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
757 PyUnicode_New) cannot be resized by this function.
758
Victor Stinner93439992011-11-20 18:29:14 +0100759 The length is a number of characters (and not the number of Py_UNICODE characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000760
761 *unicode is modified to point to the new (resized) object and 0
762 returned on success.
763
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200764 If the refcount on the object is 1, the function resizes the string in
765 place, which is usually faster than allocating a new string (and copying
766 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000767
768 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200769 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000770
Mark Hammond91a681d2002-08-12 07:21:58 +0000771PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 PyObject **unicode, /* Pointer to the Unicode object */
773 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000774 );
775
Guido van Rossumd8225182000-03-10 22:33:05 +0000776/* Coerce obj to a Unicode object and return a reference with
777 *incremented* refcount.
778
779 Coercion is done in the following way:
780
Georg Brandl952867a2010-06-27 10:17:12 +0000781 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000782 under the assumptions that they contain data using the UTF-8
783 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000784
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000785 2. All other objects (including Unicode objects) raise an
786 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000787
788 The API returns NULL in case of an error. The caller is responsible
789 for decref'ing the returned objects.
790
791*/
792
Mark Hammond91a681d2002-08-12 07:21:58 +0000793PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000795 const char *encoding, /* encoding */
796 const char *errors /* error handling */
797 );
798
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000799/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000800 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000802 Unicode objects are passed back as-is (subclasses are converted to
803 true Unicode objects), all other objects are delegated to
804 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000805 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000806
807 The API returns NULL in case of an error. The caller is responsible
808 for decref'ing the returned objects.
809
810*/
811
Mark Hammond91a681d2002-08-12 07:21:58 +0000812PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000814 );
815
/* Build a Unicode object from the ASCII-encoded format string and the
   arguments that accompany it. PyUnicode_FromFormatV() receives those
   arguments as a va_list, PyUnicode_FromFormat() as a variable argument
   list.
   NOTE(review): the supported format directives are defined by the
   implementation and are not visible in this header. */
Victor Stinner1205f272010-09-11 00:54:47 +0000816PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
817 const char *format, /* ASCII-encoded string */
818 va_list vargs
819 );
820PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
821 const char *format, /* ASCII-encoded string */
822 ...
823 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000824
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000825#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000826/* Format the object based on the format_spec, as defined in PEP 3101
827 (Advanced String Formatting). */
828PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829 PyObject *format_spec,
830 Py_ssize_t start,
831 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000832#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000833
/* String interning API. PyUnicode_InternInPlace() and
   PyUnicode_InternImmortal() take the address of an object pointer and
   return nothing; PyUnicode_InternFromString() takes a UTF-8 encoded C
   string and returns an object.
   NOTE(review): presumably the first two may replace the pointed-to
   reference with the interned instance -- confirm against the
   implementation. */
Walter Dörwald16807132007-05-25 13:52:07 +0000834PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
835PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000836PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
837 const char *u /* UTF-8 encoded string */
838 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000839#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000840PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000841#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000842
843/* Use only if you know it's a string */
/* Expands to the state.interned field of op after an unchecked cast to
   PyASCIIObject *; no type test is performed, hence the restriction
   stated above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844#define PyUnicode_CHECK_INTERNED(op) \
845 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000846
Guido van Rossumd8225182000-03-10 22:33:05 +0000847/* --- wchar_t support for platforms which support it --------------------- */
848
849#ifdef HAVE_WCHAR_H
850
Georg Brandl952867a2010-06-27 10:17:12 +0000851/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000852 size.
853
854 The buffer is copied into the new object. */
855
Mark Hammond91a681d2002-08-12 07:21:58 +0000856PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000857 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000858 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000859 );
860
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000861/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000862 most size wchar_t characters are copied.
863
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000864 Note that the resulting wchar_t string may or may not be
865 0-terminated. It is the responsibility of the caller to make sure
866 that the wchar_t string is 0-terminated in case this is required by
867 the application.
868
869 Returns the number of wchar_t characters copied (excluding a
870 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000871 error. */
872
Martin v. Löwis18e16552006-02-15 17:27:45 +0000873PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000874 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000875 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000876 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000877 );
878
Victor Stinner137c34c2010-09-29 10:25:54 +0000879/* Convert the Unicode object to a wide character string. The output string
880 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200881 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000882
883 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
884 on success. On error, returns NULL and raises a MemoryError; *size is
885 undefined. */
886
887PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000888 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000889 Py_ssize_t *size /* number of characters of the result */
890 );
891
Victor Stinner9f789e72011-10-01 03:57:28 +0200892#ifndef Py_LIMITED_API
/* Private helper, declared only when Py_LIMITED_API is not defined.
   NOTE(review): presumably returns the characters of s converted to the
   storage width selected by kind; whether the result is newly allocated
   and who releases it is not visible in this header -- confirm against
   the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200894#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895
Guido van Rossumd8225182000-03-10 22:33:05 +0000896#endif
897
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000898/* --- Unicode ordinals --------------------------------------------------- */
899
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900/* Create a Unicode Object from the given Unicode code point ordinal.
901
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000902 The ordinal must be in range(0x10000) on narrow Python builds
903 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
904 raised in case it is not.
905
906*/
907
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000908PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000909
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000910/* --- Free-list management ----------------------------------------------- */
911
912/* Clear the free list used by the Unicode implementation.
913
914 This can be used to release memory used for objects on the free
915 list back to the Python memory allocator.
916
917*/
918
919PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
920
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000922
923 Many of these APIs take two arguments encoding and errors. These
924 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000925 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000926
Georg Brandl952867a2010-06-27 10:17:12 +0000927 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000928
929 Error handling is set by errors which may also be set to NULL
930 meaning to use the default handling defined for the codec. Default
931 error handling for all builtin codecs is "strict" (ValueErrors are
932 raised).
933
934 The codecs all use a similar interface. Only deviations from the
935 generic ones are documented.
936
937*/
938
Fred Drakecb093fe2000-05-09 19:51:53 +0000939/* --- Manage the default encoding ---------------------------------------- */
940
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000941/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000942 Unicode object unicode and the size of the encoded representation
943 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000944
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000945 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000946
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200947 This function caches the UTF-8 encoded string in the unicodeobject
948 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 when the unicodeobject is deallocated.
950
951 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
952 support the previous internal function with the same behaviour.
953
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000954 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000955 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000956
957 *** If you need to access the Unicode object as UTF-8 bytes string,
958 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000959*/
960
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000961#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000964 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000966#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000967
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000968/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000969 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
972 in the unicodeobject.
973
974 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
975 support the previous internal function with the same behaviour.
976
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000977 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000978 extracted from the returned data.
979
980 *** This API is for interpreter INTERNAL USE ONLY and will likely
981 *** be removed or changed in the future.
982
983 *** If you need to access the Unicode object as UTF-8 bytes string,
984 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000985
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000986*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000987
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000988#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
990#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000991#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000992
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000993/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000994
Mark Hammond91a681d2002-08-12 07:21:58 +0000995PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000996
Guido van Rossumd8225182000-03-10 22:33:05 +0000997/* --- Generic Codecs ----------------------------------------------------- */
998
999/* Create a Unicode object by decoding the encoded string s of the
1000 given size. */
1001
Mark Hammond91a681d2002-08-12 07:21:58 +00001002PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001003 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001005 const char *encoding, /* encoding */
1006 const char *errors /* error handling */
1007 );
1008
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001009/* Decode a Unicode object unicode and return the result as Python
1010 object. */
1011
1012PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001013 PyObject *unicode, /* Unicode object */
1014 const char *encoding, /* encoding */
1015 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001016 );
1017
1018/* Decode a Unicode object unicode and return the result as Unicode
1019 object. */
1020
1021PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001022 PyObject *unicode, /* Unicode object */
1023 const char *encoding, /* encoding */
1024 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001025 );
1026
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001027/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001028 Python string object. */
1029
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001030#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001031PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001032 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001033 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001034 const char *encoding, /* encoding */
1035 const char *errors /* error handling */
1036 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001037#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001038
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001039/* Encodes a Unicode object and returns the result as Python
1040 object. */
1041
1042PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 PyObject *unicode, /* Unicode object */
1044 const char *encoding, /* encoding */
1045 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001046 );
1047
Guido van Rossumd8225182000-03-10 22:33:05 +00001048/* Encodes a Unicode object and returns the result as Python string
1049 object. */
1050
Mark Hammond91a681d2002-08-12 07:21:58 +00001051PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001052 PyObject *unicode, /* Unicode object */
1053 const char *encoding, /* encoding */
1054 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001055 );
1056
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001057/* Encodes a Unicode object and returns the result as Unicode
1058 object. */
1059
1060PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 PyObject *unicode, /* Unicode object */
1062 const char *encoding, /* encoding */
1063 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001064 );
1065
1066/* Build an encoding map. */
1067
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001068PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1069 PyObject* string /* 256 character map */
1070 );
1071
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001072/* --- UTF-7 Codecs ------------------------------------------------------- */
1073
Mark Hammond91a681d2002-08-12 07:21:58 +00001074PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001075 const char *string, /* UTF-7 encoded string */
1076 Py_ssize_t length, /* size of string */
1077 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 );
1079
/* Takes the same parameters as PyUnicode_DecodeUTF7() plus consumed, an
   output location for the number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed lets the decoder stop
   before an incomplete trailing sequence instead of treating it as an
   error -- confirm against the implementation. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001080PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001081 const char *string, /* UTF-7 encoded string */
1082 Py_ssize_t length, /* size of string */
1083 const char *errors, /* error handling */
1084 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001085 );
1086
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001087#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001088PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 const Py_UNICODE *data, /* Unicode char buffer */
1090 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1091 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1092 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1093 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 );
/* Private variant of PyUnicode_EncodeUTF7() that takes a Unicode object
   instead of a Py_UNICODE buffer and its length; the base64SetO,
   base64WhiteSpace and errors parameters are the same. */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001095PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1096 PyObject *unicode, /* Unicode object */
1097 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1098 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1099 const char *errors /* error handling */
1100 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001101#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001102
Guido van Rossumd8225182000-03-10 22:33:05 +00001103/* --- UTF-8 Codecs ------------------------------------------------------- */
1104
Mark Hammond91a681d2002-08-12 07:21:58 +00001105PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 const char *string, /* UTF-8 encoded string */
1107 Py_ssize_t length, /* size of string */
1108 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001109 );
1110
/* Takes the same parameters as PyUnicode_DecodeUTF8() plus consumed, an
   output location for the number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed lets the decoder stop
   before an incomplete trailing byte sequence instead of treating it as
   an error -- confirm against the implementation. */
Walter Dörwald69652032004-09-07 20:24:22 +00001111PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001112 const char *string, /* UTF-8 encoded string */
1113 Py_ssize_t length, /* size of string */
1114 const char *errors, /* error handling */
1115 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001116 );
1117
Mark Hammond91a681d2002-08-12 07:21:58 +00001118PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001120 );
1121
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001122#ifndef Py_LIMITED_API
/* Private variant of PyUnicode_AsUTF8String() that additionally takes an
   errors argument (error handling name; see the "Builtin Codecs" notes
   for the meaning of NULL). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1124 PyObject *unicode,
1125 const char *errors);
1126
Mark Hammond91a681d2002-08-12 07:21:58 +00001127PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 const Py_UNICODE *data, /* Unicode char buffer */
1129 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1130 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001131 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001132#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001133
Walter Dörwald41980ca2007-08-16 21:55:45 +00001134/* --- UTF-32 Codecs ------------------------------------------------------ */
1135
1136/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1137 the corresponding Unicode object.
1138
1139 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001140 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001141
1142 If byteorder is non-NULL, the decoder starts decoding using the
1143 given byte order:
1144
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 *byteorder == -1: little endian
1146 *byteorder == 0: native order
1147 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001148
1149 In native mode, the first four bytes of the stream are checked for a
1150 BOM mark. If found, the BOM mark is analysed, the byte order
1151 adjusted and the BOM skipped. In the other modes, no BOM mark
1152 interpretation is done. After completion, *byteorder is set to the
1153 current byte order at the end of input data.
1154
1155 If byteorder is NULL, the codec starts in native order mode.
1156
1157*/
1158
1159PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001160 const char *string, /* UTF-32 encoded string */
1161 Py_ssize_t length, /* size of string */
1162 const char *errors, /* error handling */
1163 int *byteorder /* pointer to byteorder to use
1164 0=native;-1=LE,1=BE; updated on
1165 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001166 );
1167
1168PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001169 const char *string, /* UTF-32 encoded string */
1170 Py_ssize_t length, /* size of string */
1171 const char *errors, /* error handling */
1172 int *byteorder, /* pointer to byteorder to use
1173 0=native;-1=LE,1=BE; updated on
1174 exit */
1175 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001176 );
1177
1178/* Returns a Python string using the UTF-32 encoding in native byte
1179 order. The string always starts with a BOM mark. */
1180
1181PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001182 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001183 );
1184
1185/* Returns a Python string object holding the UTF-32 encoded value of
1186 the Unicode data.
1187
1188 Output is written according to the value of byteorder, which selects
1189 the byte order:
1190
1191 byteorder == -1: little endian
1192 byteorder == 0: native byte order (writes a BOM mark)
1193 byteorder == 1: big endian
1194
1195 If byteorder is 0, the output string will always start with the
1196 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1197 prepended.
1198
1199*/
1200
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001201#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001202PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001203 const Py_UNICODE *data, /* Unicode char buffer */
1204 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1205 const char *errors, /* error handling */
1206 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001207 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001208PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1209 PyObject *object, /* Unicode object */
1210 const char *errors, /* error handling */
1211 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1212 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001213#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001214
Guido van Rossumd8225182000-03-10 22:33:05 +00001215/* --- UTF-16 Codecs ------------------------------------------------------ */
1216
Guido van Rossum9e896b32000-04-05 20:11:21 +00001217/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001218 the corresponding Unicode object.
1219
1220 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001221 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001222
1223 If byteorder is non-NULL, the decoder starts decoding using the
1224 given byte order:
1225
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 *byteorder == -1: little endian
1227 *byteorder == 0: native order
1228 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001229
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001230 In native mode, the first two bytes of the stream are checked for a
1231 BOM mark. If found, the BOM mark is analysed, the byte order
1232 adjusted and the BOM skipped. In the other modes, no BOM mark
1233 interpretation is done. After completion, *byteorder is set to the
1234 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001235
1236 If byteorder is NULL, the codec starts in native order mode.
1237
1238*/
1239
Mark Hammond91a681d2002-08-12 07:21:58 +00001240PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001241 const char *string, /* UTF-16 encoded string */
1242 Py_ssize_t length, /* size of string */
1243 const char *errors, /* error handling */
1244 int *byteorder /* pointer to byteorder to use
1245 0=native;-1=LE,1=BE; updated on
1246 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001247 );
1248
Walter Dörwald69652032004-09-07 20:24:22 +00001249PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001250 const char *string, /* UTF-16 encoded string */
1251 Py_ssize_t length, /* size of string */
1252 const char *errors, /* error handling */
1253 int *byteorder, /* pointer to byteorder to use
1254 0=native;-1=LE,1=BE; updated on
1255 exit */
1256 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001257 );
1258
Guido van Rossumd8225182000-03-10 22:33:05 +00001259/* Returns a Python string using the UTF-16 encoding in native byte
1260 order. The string always starts with a BOM mark. */
1261
Mark Hammond91a681d2002-08-12 07:21:58 +00001262PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001263 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001264 );
1265
1266/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001267 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001268
1269 Output is written according to the value of byteorder, which selects
1270 the byte order:
1271
1272 byteorder == -1: little endian
1273 byteorder == 0: native byte order (writes a BOM mark)
1274 byteorder == 1: big endian
1275
1276 If byteorder is 0, the output string will always start with the
1277 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1278 prepended.
1279
1280 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1281 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001282 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001283
1284*/
1285
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001286#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001287PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 const Py_UNICODE *data, /* Unicode char buffer */
1289 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1290 const char *errors, /* error handling */
1291 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001292 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001293PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1294 PyObject* unicode, /* Unicode object */
1295 const char *errors, /* error handling */
1296 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1297 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001298#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001299
1300/* --- Unicode-Escape Codecs ---------------------------------------------- */
1301
Mark Hammond91a681d2002-08-12 07:21:58 +00001302PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 const char *string, /* Unicode-Escape encoded string */
1304 Py_ssize_t length, /* size of string */
1305 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001306 );
1307
Mark Hammond91a681d2002-08-12 07:21:58 +00001308PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001310 );
1311
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001312#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001313PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 const Py_UNICODE *data, /* Unicode char buffer */
1315 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001316 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001317#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001318
1319/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1320
Mark Hammond91a681d2002-08-12 07:21:58 +00001321PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 const char *string, /* Raw-Unicode-Escape encoded string */
1323 Py_ssize_t length, /* size of string */
1324 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001325 );
1326
Mark Hammond91a681d2002-08-12 07:21:58 +00001327PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001329 );
1330
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001331#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001332PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 const Py_UNICODE *data, /* Unicode char buffer */
1334 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001335 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001336#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001337
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001338/* --- Unicode Internal Codec ---------------------------------------------
1339
1340 Only for internal use in _codecsmodule.c */
1341
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001342#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001343PyObject *_PyUnicode_DecodeUnicodeInternal(
1344 const char *string, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001345 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001346 const char *errors /* error handling */
1347 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001348#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001349
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001351
1352 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1353
1354*/
1355
Mark Hammond91a681d2002-08-12 07:21:58 +00001356PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 const char *string, /* Latin-1 encoded string */
1358 Py_ssize_t length, /* size of string */
1359 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001360 );
1361
Mark Hammond91a681d2002-08-12 07:21:58 +00001362PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001363 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001364 );
1365
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001366#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1368 PyObject* unicode, /* Unicode object */
1369 const char* errors); /* error handling */
1370
Mark Hammond91a681d2002-08-12 07:21:58 +00001371PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372 const Py_UNICODE *data, /* Unicode char buffer */
1373 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1374 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001375 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001376#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001377
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001378/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001379
1380 Only 7-bit ASCII data is accepted. All other codes generate errors.
1381
1382*/
1383
Mark Hammond91a681d2002-08-12 07:21:58 +00001384PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 const char *string, /* ASCII encoded string */
1386 Py_ssize_t length, /* size of string */
1387 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001388 );
1389
Mark Hammond91a681d2002-08-12 07:21:58 +00001390PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001392 );
1393
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001394#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1396 PyObject* unicode, /* Unicode object */
1397 const char* errors); /* error handling */
1398
Mark Hammond91a681d2002-08-12 07:21:58 +00001399PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 const Py_UNICODE *data, /* Unicode char buffer */
1401 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1402 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001403 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001404#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001405
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001406/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001409
1410 Decoding mappings must map single string characters to single
1411 Unicode characters, integers (which are then interpreted as Unicode
1412 ordinals) or None (meaning "undefined mapping" and causing an
1413 error).
1414
1415 Encoding mappings must map single Unicode characters to single
1416 string characters, integers (which are then interpreted as Latin-1
1417 ordinals) or None (meaning "undefined mapping" and causing an
1418 error).
1419
1420 If a character lookup fails with a LookupError, the character is
1421 copied as-is meaning that its ordinal value will be interpreted as
1422 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1423 to contain those mappings which map characters to different code
1424 points.
1425
1426*/
1427
Mark Hammond91a681d2002-08-12 07:21:58 +00001428PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001429 const char *string, /* Encoded string */
1430 Py_ssize_t length, /* size of string */
1431 PyObject *mapping, /* character mapping
1432 (char ordinal -> unicode ordinal) */
1433 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001434 );
1435
Mark Hammond91a681d2002-08-12 07:21:58 +00001436PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 PyObject *unicode, /* Unicode object */
1438 PyObject *mapping /* character mapping
1439 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001440 );
1441
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001442#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001443PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 const Py_UNICODE *data, /* Unicode char buffer */
1445 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1446 PyObject *mapping, /* character mapping
1447 (unicode ordinal -> char ordinal) */
1448 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001449 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001450PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1451 PyObject *unicode, /* Unicode object */
1452 PyObject *mapping, /* character mapping
1453 (unicode ordinal -> char ordinal) */
1454 const char *errors /* error handling */
1455 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001456#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001457
1458/* Translate a Py_UNICODE buffer of the given length by applying a
1459 character mapping table to it and return the resulting Unicode
1460 object.
1461
1462 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001463 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001464
1465 Mapping tables may be dictionaries or sequences. Unmapped character
1466 ordinals (ones which cause a LookupError) are left untouched and
1467 are copied as-is.
1468
1469*/
1470
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001471#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001472PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 const Py_UNICODE *data, /* Unicode char buffer */
1474 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1475 PyObject *table, /* Translate table */
1476 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001477 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001478#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001479
Victor Stinner99b95382011-07-04 14:23:54 +02001480#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001481
Guido van Rossumefec1152000-03-28 02:01:15 +00001482/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001483
Mark Hammond91a681d2002-08-12 07:21:58 +00001484PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001485 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001486 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001487 const char *errors /* error handling */
1488 );
1489
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001490PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1491 const char *string, /* MBCS encoded string */
1492 Py_ssize_t length, /* size of string */
1493 const char *errors, /* error handling */
1494 Py_ssize_t *consumed /* bytes consumed */
1495 );
1496
Victor Stinner3a50e702011-10-18 21:21:00 +02001497PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1498 int code_page, /* code page number */
1499 const char *string, /* encoded string */
1500 Py_ssize_t length, /* size of string */
1501 const char *errors, /* error handling */
1502 Py_ssize_t *consumed /* bytes consumed */
1503 );
1504
Mark Hammond91a681d2002-08-12 07:21:58 +00001505PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001506 PyObject *unicode /* Unicode object */
1507 );
1508
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001509#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001510PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001511 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001512 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001513 const char *errors /* error handling */
1514 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001515#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001516
Victor Stinner3a50e702011-10-18 21:21:00 +02001517PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1518 int code_page, /* code page number */
1519 PyObject *unicode, /* Unicode object */
1520 const char *errors /* error handling */
1521 );
1522
Victor Stinner99b95382011-07-04 14:23:54 +02001523#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001524
Guido van Rossum9e896b32000-04-05 20:11:21 +00001525/* --- Decimal Encoder ---------------------------------------------------- */
1526
1527/* Takes a Unicode string holding a decimal value and writes it into
1528 an output buffer using standard ASCII digit codes.
1529
1530 The output buffer has to provide at least length+1 bytes of storage
1531 area. The output string is 0-terminated.
1532
1533 The encoder converts whitespace to ' ', decimal characters to their
1534 corresponding ASCII digit and all other Latin-1 characters except
1535 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1536 are treated as errors. This includes embedded NULL bytes.
1537
1538 Error handling is defined by the errors argument:
1539
1540 NULL or "strict": raise a ValueError
1541 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001543 "replace": replaces illegal characters with '?'
1544
1545 Returns 0 on success, -1 on failure.
1546
1547*/
1548
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001549#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001550PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001551 Py_UNICODE *s, /* Unicode buffer */
1552 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1553 char *output, /* Output buffer; must have size >= length+1
 (room for the terminating 0 byte) */
1554 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001555 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001556#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001557
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001558/* Transforms code points that have decimal digit property to the
1559 corresponding ASCII digit code points.
1560
1561 Returns a new Unicode string on success, NULL on failure.
1562*/
1563
Georg Brandlb5503082010-12-05 11:40:48 +00001564#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001565PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1566 Py_UNICODE *s, /* Unicode buffer */
1567 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1568 );
Georg Brandlb5503082010-12-05 11:40:48 +00001569#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001570
Victor Stinner6f9568b2011-11-17 00:12:44 +01001571/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 as argument instead of a raw buffer and length. This function additionally
1573 transforms spaces to ASCII because this is what the callers in longobject,
1574 floatobject, and complexobject did anyways. */
1575
1576#ifndef Py_LIMITED_API
1577PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1578 PyObject *unicode /* Unicode object */
1579 );
1580#endif
1581
Martin v. Löwis011e8422009-05-05 04:43:17 +00001582/* --- File system encoding ---------------------------------------------- */
1583
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001584/* ParseTuple converter: encode str objects to bytes using
1585 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001586
1587PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1588
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001589/* ParseTuple converter: decode bytes objects to unicode using
1590 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1591
1592PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1593
Victor Stinner77c38622010-05-14 15:58:55 +00001594/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1595 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001596
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001597 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1598 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001599
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001600 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001601*/
1602
1603PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1604 const char *s /* encoded string */
1605 );
1606
Victor Stinner77c38622010-05-14 15:58:55 +00001607/* Decode a string using Py_FileSystemDefaultEncoding
1608 and the "surrogateescape" error handler.
1609
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001610 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1611 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001612*/
1613
Martin v. Löwis011e8422009-05-05 04:43:17 +00001614PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1615 const char *s, /* encoded string */
1616 Py_ssize_t size /* size */
1617 );
1618
Victor Stinnerae6265f2010-05-15 16:27:27 +00001619/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001620 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001621
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001622 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1623 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001624*/
1625
1626PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1627 PyObject *unicode
1628 );
1629
Guido van Rossumd8225182000-03-10 22:33:05 +00001630/* --- Methods & Slots ----------------------------------------------------
1631
1632 These are capable of handling Unicode objects and strings on input
1633 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001634 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001635
1636/* Concat two strings giving a new Unicode string. */
1637
Mark Hammond91a681d2002-08-12 07:21:58 +00001638PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001639 PyObject *left, /* Left string */
1640 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001641 );
1642
Walter Dörwald1ab83302007-05-18 17:15:44 +00001643/* Concat two strings and put the result in *pleft
1644 (sets *pleft to NULL on error) */
1645
1646PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 PyObject **pleft, /* Pointer to left string */
1648 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001649 );
1650
1651/* Concat two strings, put the result in *pleft and drop the right object
1652 (sets *pleft to NULL on error) */
1653
1654PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001655 PyObject **pleft, /* Pointer to left string */
1656 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001657 );
1658
Guido van Rossumd8225182000-03-10 22:33:05 +00001659/* Split a string giving a list of Unicode strings.
1660
1661 If sep is NULL, splitting will be done at all whitespace
1662 substrings. Otherwise, splits occur at the given separator.
1663
1664 At most maxsplit splits will be done. If negative, no limit is set.
1665
1666 Separators are not included in the resulting list.
1667
1668*/
1669
Mark Hammond91a681d2002-08-12 07:21:58 +00001670PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 PyObject *s, /* String to split */
1672 PyObject *sep, /* String separator */
1673 Py_ssize_t maxsplit /* Maxsplit count */
1674 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001675
1676/* Ditto, but split at line breaks.
1677
1678 CRLF is considered to be one line break. Line breaks are not
1679 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001680
Mark Hammond91a681d2002-08-12 07:21:58 +00001681PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001682 PyObject *s, /* String to split */
1683 int keepends /* If true, line end markers are included */
1684 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001685
Thomas Wouters477c8d52006-05-27 19:21:47 +00001686/* Partition a string using a given separator. */
1687
1688PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001689 PyObject *s, /* String to partition */
1690 PyObject *sep /* String separator */
1691 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001692
1693/* Partition a string using a given separator, searching from the end of the
1694 string. */
1695
1696PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 PyObject *s, /* String to partition */
1698 PyObject *sep /* String separator */
1699 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001700
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001701/* Split a string giving a list of Unicode strings.
1702
1703 If sep is NULL, splitting will be done at all whitespace
1704 substrings. Otherwise, splits occur at the given separator.
1705
1706 At most maxsplit splits will be done; if maxsplit is negative, no
1707 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1708 the end of the string.
1709
1710 Separators are not included in the resulting list.
1711
1712*/
1713
1714PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 PyObject *s, /* String to split */
1716 PyObject *sep, /* String separator */
1717 Py_ssize_t maxsplit /* Maxsplit count */
1718 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001719
Guido van Rossumd8225182000-03-10 22:33:05 +00001720/* Translate a string by applying a character mapping table to it and
1721 return the resulting Unicode object.
1722
1723 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001724 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001725
1726 Mapping tables may be dictionaries or sequences. Unmapped character
1727 ordinals (ones which cause a LookupError) are left untouched and
1728 are copied as-is.
1729
1730*/
1731
Mark Hammond91a681d2002-08-12 07:21:58 +00001732PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001733 PyObject *str, /* String */
1734 PyObject *table, /* Translate table */
1735 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001736 );
1737
1738/* Join a sequence of strings using the given separator and return
1739 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740
Mark Hammond91a681d2002-08-12 07:21:58 +00001741PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 PyObject *separator, /* Separator string */
1743 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001744 );
1745
1746/* Return 1 if substr matches str[start:end] at the given tail end, 0
1747 otherwise. */
1748
Martin v. Löwis18e16552006-02-15 17:27:45 +00001749PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001750 PyObject *str, /* String */
1751 PyObject *substr, /* Prefix or Suffix string */
1752 Py_ssize_t start, /* Start index */
1753 Py_ssize_t end, /* Stop index */
1754 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001755 );
1756
1757/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001758 given search direction or -1 if not found. -2 is returned in case
1759 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001760
Martin v. Löwis18e16552006-02-15 17:27:45 +00001761PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001762 PyObject *str, /* String */
1763 PyObject *substr, /* Substring to find */
1764 Py_ssize_t start, /* Start index */
1765 Py_ssize_t end, /* Stop index */
1766 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001767 );
1768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769/* Like PyUnicode_Find, but search for a single character only. */
1770PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1771 PyObject *str, /* String */
1772 Py_UCS4 ch, /* Character to find */
1773 Py_ssize_t start, /* Start index */
1774 Py_ssize_t end, /* Stop index */
1775 int direction /* Find direction (as for PyUnicode_Find) */
1776 );
1777
Barry Warsaw51ac5802000-03-20 16:36:48 +00001778/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001779
Martin v. Löwis18e16552006-02-15 17:27:45 +00001780PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001781 PyObject *str, /* String */
1782 PyObject *substr, /* Substring to count */
1783 Py_ssize_t start, /* Start index */
1784 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001785 );
1786
Barry Warsaw51ac5802000-03-20 16:36:48 +00001787/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001788 and return the resulting Unicode object. */
1789
Mark Hammond91a681d2002-08-12 07:21:58 +00001790PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001791 PyObject *str, /* String */
1792 PyObject *substr, /* Substring to find */
1793 PyObject *replstr, /* Substring to replace */
1794 Py_ssize_t maxcount /* Max. number of replacements to apply;
1795 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001796 );
1797
1798/* Compare two strings and return -1, 0, 1 for less than, equal,
1799 greater than resp. */
1800
Mark Hammond91a681d2002-08-12 07:21:58 +00001801PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001802 PyObject *left, /* Left string */
1803 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001804 );
1805
/* Compare a Python object with a C string.
   NOTE(review): presumably returns -1, 0, 1 like PyUnicode_Compare; the
   return convention is not visible in this header -- confirm. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001806PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1807 PyObject *left, /* Left operand (presumably a Unicode object -- confirm) */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001808 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001809 );
1810
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001811/* Rich compare two strings and return one of the following:
1812
1813 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001814 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001815 - Py_NotImplemented in case the type combination is unknown
1816
1817 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1818 case the conversion of the arguments to Unicode fails with a
1819 UnicodeDecodeError.
1820
1821 Possible values for op:
1822
1823 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1824
1825*/
1826
1827PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001828 PyObject *left, /* Left string */
1829 PyObject *right, /* Right string */
1830 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001831 );
1832
Thomas Wouters7e474022000-07-16 12:04:32 +00001833/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001834 the resulting Unicode string. */
1835
Mark Hammond91a681d2002-08-12 07:21:58 +00001836PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001837 PyObject *format, /* Format string */
1838 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001839 );
1840
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001841/* Checks whether element is contained in container and returns 1/0
1842 accordingly.
1843
1844 element has to coerce to a one-element Unicode string. -1 is
1845 returned in case of an error. */
1846
Mark Hammond91a681d2002-08-12 07:21:58 +00001847PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001848 PyObject *container, /* Container string */
1849 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001850 );
1851
Martin v. Löwis47383402007-08-15 07:32:56 +00001852/* Checks whether argument is a valid identifier. */
1853
1854PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1855
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001856#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001857/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001858PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001859 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001860 int striptype,
1861 PyObject *sepobj
1862 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001863#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001864
Eric Smith5807c412008-05-11 21:00:57 +00001865/* Using the current locale, insert the thousands grouping
1866 into the string pointed to by buffer. For the argument descriptions,
1867 see Objects/stringlib/localeutil.h */
1868
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001869#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001870PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1871 Py_ssize_t n_buffer,
1872 Py_UNICODE *digits,
1873 Py_ssize_t n_digits,
1874 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001875#endif
Eric Smith5807c412008-05-11 21:00:57 +00001876
Eric Smitha3b1ac82009-04-03 14:45:06 +00001877/* Using explicit passed-in values, insert the thousands grouping
1878 into the string pointed to by buffer. For the argument descriptions,
1879 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001880#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001882 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 int kind,
1884 void *buffer,
1885 Py_ssize_t n_buffer,
1886 void *digits,
1887 Py_ssize_t n_digits,
1888 Py_ssize_t min_width,
1889 const char *grouping,
1890 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001891#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001892/* === Characters Type APIs =============================================== */
1893
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001894/* Helper array used by Py_UNICODE_ISSPACE(). */
1895
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001896#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001897PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1898
Guido van Rossumd8225182000-03-10 22:33:05 +00001899/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001900 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001901
1902 These APIs are implemented in Objects/unicodectype.c.
1903
1904*/
1905
Mark Hammond91a681d2002-08-12 07:21:58 +00001906PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001907 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001908 );
1909
Mark Hammond91a681d2002-08-12 07:21:58 +00001910PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001911 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001912 );
1913
Mark Hammond91a681d2002-08-12 07:21:58 +00001914PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001915 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001916 );
1917
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001918PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001919 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001920 );
1921
1922PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001923 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001924 );
1925
Mark Hammond91a681d2002-08-12 07:21:58 +00001926PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001927 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001928 );
1929
Mark Hammond91a681d2002-08-12 07:21:58 +00001930PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001931 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001932 );
1933
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001934PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1935 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001938PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1939 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001942PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1943 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001944 );
1945
Mark Hammond91a681d2002-08-12 07:21:58 +00001946PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001947 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001948 );
1949
Mark Hammond91a681d2002-08-12 07:21:58 +00001950PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001951 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001952 );
1953
Mark Hammond91a681d2002-08-12 07:21:58 +00001954PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001955 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001956 );
1957
Mark Hammond91a681d2002-08-12 07:21:58 +00001958PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001959 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001960 );
1961
Mark Hammond91a681d2002-08-12 07:21:58 +00001962PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001963 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001964 );
1965
Mark Hammond91a681d2002-08-12 07:21:58 +00001966PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001967 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001968 );
1969
Georg Brandl559e5d72008-06-11 18:37:52 +00001970PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001971 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001972 );
1973
Mark Hammond91a681d2002-08-12 07:21:58 +00001974PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001975 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001976 );
1977
Victor Stinneref8d95c2010-08-16 22:03:11 +00001978PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1979 const Py_UNICODE *u
1980 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001981
1982PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001983 Py_UNICODE *s1,
1984 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001985
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001986PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1987 Py_UNICODE *s1, const Py_UNICODE *s2);
1988
Martin v. Löwis5b222132007-06-10 09:51:05 +00001989PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001990 Py_UNICODE *s1,
1991 const Py_UNICODE *s2,
1992 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001993
1994PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001995 const Py_UNICODE *s1,
1996 const Py_UNICODE *s2
1997 );
1998
1999PyAPI_FUNC(int) Py_UNICODE_strncmp(
2000 const Py_UNICODE *s1,
2001 const Py_UNICODE *s2,
2002 size_t n
2003 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002004
2005PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002006 const Py_UNICODE *s,
2007 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002008 );
2009
Victor Stinner331ea922010-08-10 16:37:20 +00002010PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002011 const Py_UNICODE *s,
2012 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002013 );
2014
Victor Stinner71133ff2010-09-01 23:43:53 +00002015/* Create a copy of a unicode string ending with a nul character. Return NULL
2016 and raise a MemoryError exception on memory allocation failure, otherwise
2017 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2018
Victor Stinner46408602010-09-03 16:18:00 +00002019PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002020 PyObject *unicode
2021 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002022#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002023
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002024#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002025PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002026 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002027 int check_content);
2028#endif
2029
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002030/********************* String Literals ****************************************/
2031/* This structure helps managing static strings. The basic usage goes like this:
2032 Instead of doing
2033
2034 r = PyObject_CallMethod(o, "foo", "args", ...);
2035
2036 do
2037
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002038 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002039 ...
2040 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2041
2042 PyId_foo is a static variable, either on block level or file level. On first
2043 usage, the string "foo" is interned, and the structures are linked. On interpreter
2044 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2045
2046 Alternatively, _Py_static_string allows choosing the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002047 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002048 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2049*/
/* Record describing one static string. Instances are defined with
   _Py_static_string or _Py_IDENTIFIER and passed to _PyUnicode_FromId. */
2050typedef struct _Py_Identifier {
2051 struct _Py_Identifier *next; /* Link to another identifier; set to 0 by _Py_static_string */
2052 const char* string; /* C string given as the value argument of _Py_static_string */
2053 PyObject *object; /* Set to 0 initially; presumably the interned Unicode object once resolved -- confirm */
2054} _Py_Identifier;
2055
/* Define a static _Py_Identifier named varname whose string field is value;
   the next and object fields start out as 0. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002056#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
/* Define a static _Py_Identifier named PyId_<varname> whose string field is
   the stringified varname. */
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002057#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002058
2059/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2060PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2061/* Clear all static strings. */
2062PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2063
Guido van Rossumd8225182000-03-10 22:33:05 +00002064#ifdef __cplusplus
2065}
2066#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002067#endif /* !Py_UNICODEOBJECT_H */