blob: 9308a6aa96af9009290ae22dfc9f144e80579767 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
#ifndef Py_LIMITED_API
/* Py_UNICODE is a plain alias of wchar_t; PY_UNICODE_TYPE names the same
   type for code that needs it as a macro.  Neither is exposed when
   Py_LIMITED_API is defined. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
/* Whitespace test.  Code points below 128 are answered from the
   _Py_ascii_whitespace lookup table; everything else is forwarded to
   _PyUnicode_IsWhitespace().  Note that ch is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
/* Character classification: each macro forwards to the matching
   _PyUnicode_Is*() helper. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversion. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric classification. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value lookup. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.  The tests are
   tried in that order and short-circuit, so ch may be evaluated up to
   four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
/* Copy length Py_UNICODE code units from source to target via Py_MEMCPY.
   length is a count of code units, not bytes. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store value into the first length code units of target.
   Every argument is evaluated exactly once (length is captured before the
   loop instead of being re-evaluated on each iteration), so arguments with
   side effects are safe.  A length <= 0 writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value); \
        const Py_ssize_t n_ = (length); \
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_; \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* macros to work with surrogates
   The three range tests evaluate ch twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and last code units are compared before falling back to
   memcmp() over the whole substring.

   NOTE(review): the expansion reads ->wstr and ->wstr_length directly
   from both arguments.  In the PEP 393 structures declared in this header
   wstr is a member of PyASCIIObject and wstr_length a member of
   PyCompactUnicodeObject, both reached only through nested _base members,
   so a PyUnicodeObject* argument has no such direct fields.  Confirm
   whether any caller can still use this macro; PyUnicode_Tailmatch() is
   the function-based alternative. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000203#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000211#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214 structure. state.ascii and state.compact are set, and the data
215 immediately follow the structure. utf8_length and wstr_length can be found
216 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000217typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200218 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200219
220 - compact ascii:
221
222 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100223 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200224 * kind = PyUnicode_1BYTE_KIND
225 * compact = 1
226 * ascii = 1
227 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200228 * (length is the length of the utf8 and wstr strings)
229 * (data starts just after the structure)
230 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200231
232 - compact:
233
234 * structure = PyCompactUnicodeObject
Victor Stinner80bc72d2011-12-22 03:23:10 +0100235 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237 PyUnicode_4BYTE_KIND
238 * compact = 1
239 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200240 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200242 * utf8_length = 0 if utf8 is NULL
243 * wstr is shared with data and wstr_length=length
244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200246 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200247 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200248
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200249 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200250
251 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100252 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100253 * length = 0 (use wstr_length)
254 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200255 * kind = PyUnicode_WCHAR_KIND
256 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200257 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200258 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100259 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200260 * wstr is not NULL
261 * data.any is NULL
262 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200263 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200266
267 * structure = PyUnicodeObject structure
Victor Stinner7a9105a2011-12-12 00:13:42 +0100268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270 PyUnicode_4BYTE_KIND
271 * compact = 0
272 * ready = 1
273 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200274 * utf8 is shared and utf8_length = length with data.any if ascii = 1
275 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100276 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
278 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
279 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200280
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200281 Compact strings use only one memory block (structure + characters),
282 whereas legacy strings use one block for the structure and one block
283 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200284
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200285 Legacy strings are created by PyUnicode_FromUnicode() and
286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287 when PyUnicode_READY() is called.
288
289 See also _PyUnicode_CheckConsistency().
290 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000291 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000293 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 struct {
295 /*
296 SSTATE_NOT_INTERNED (0)
297 SSTATE_INTERNED_MORTAL (1)
298 SSTATE_INTERNED_IMMORTAL (2)
299
300 If interned != SSTATE_NOT_INTERNED, the two references from the
301 dictionary to this object are *not* counted in ob_refcnt.
302 */
303 unsigned int interned:2;
304 /* Character size:
305
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200306 - PyUnicode_WCHAR_KIND (0):
307
308 * character type = wchar_t (16 or 32 bits, depending on the
309 platform)
310
311 - PyUnicode_1BYTE_KIND (1):
312
313 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100314 * all characters are in the range U+0000-U+00FF (latin1)
315 * if ascii is set, all characters are in the range U+0000-U+007F
316 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200317 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200318
319 - PyUnicode_2BYTE_KIND (2):
320
321 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100322 * all characters are in the range U+0000-U+FFFF (BMP)
323 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200324
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200325 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200326
327 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100328 * all characters are in the range U+0000-U+10FFFF
329 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200331 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200332 /* Compact is with respect to the allocation scheme. Compact unicode
333 objects only require one memory block while non-compact objects use
334 one block for the PyUnicodeObject struct and another for its data
335 buffer. */
336 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100337 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200340 unsigned int ascii:1;
341 /* The ready flag indicates whether the object layout is initialized
342 completely. This means that this is either a compact object, or
343 the data pointer is filled out. The bit is redundant, and helps
344 to minimize the test in PyUnicode_IS_READY(). */
345 unsigned int ready:1;
Antoine Pitrou8c6f8dc2014-03-23 22:55:03 +0100346 /* Padding to ensure that PyUnicode_DATA() is always aligned to
347 4 bytes (see issue #19537 on m68k). */
348 unsigned int :24;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200349 } state;
350 wchar_t *wstr; /* wchar_t representation (null-terminated) */
351} PyASCIIObject;
352
353/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200354 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 immediately follow the structure. */
356typedef struct {
357 PyASCIIObject _base;
358 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
359 * terminating \0. */
360 char *utf8; /* UTF-8 representation (null-terminated) */
361 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
362 * surrogates count as two code points. */
363} PyCompactUnicodeObject;
364
365/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
366 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200367 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368typedef struct {
369 PyCompactUnicodeObject _base;
370 union {
371 void *any;
372 Py_UCS1 *latin1;
373 Py_UCS2 *ucs2;
374 Py_UCS4 *ucs4;
375 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000376} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000377#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000378
Mark Hammond91a681d2002-08-12 07:21:58 +0000379PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000380PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000381
Thomas Wouters27d517b2007-02-25 20:39:11 +0000382#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000383 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
384#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000385
386/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000387#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388
/* Length of the wstr representation.  Compact ASCII objects have no
   wstr_length member (they are plain PyASCIIObject), so their length field
   is used instead.  No type checks are performed; op is parenthesized under
   each cast so that compound argument expressions are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
393
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points.

   op is evaluated several times.  The result of PyUnicode_AsUnicode() is
   discarded here; only an assert checks that wstr was filled in. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation in bytes. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)

/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* The same buffer as PyUnicode_AS_UNICODE(), viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
422
423
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200424/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200425
Victor Stinner6f9568b2011-11-17 00:12:44 +0100426/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
/* Interning state: the values stored in the 2-bit PyASCIIObject
   state.interned field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
432
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450
/* Storage kinds.  The three *BYTE_KIND values equal the size in bytes of
   one stored character. */
enum PyUnicode_Kind {
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly.  */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
470
/* Return one of the PyUnicode_*_KIND values defined above.
   In builds with assertions enabled, op is checked to be a unicode object
   and to be ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
476
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the buffer starts right after the structure, which is
   PyASCIIObject for ASCII strings and PyCompactUnicodeObject otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the buffer is the separately stored data.any
   pointer, asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on the compact flag to one of the two helpers above. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200492/* In the access macros below, "kind" may be evaluated more than once.
493 All other macro parameters are evaluated exactly once, so it is safe
494 to put side effects into them (such as increasing the index). */
495
/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   value is truncated by a cast to the element type selected by kind.  Any
   kind other than 1BYTE or 2BYTE takes the 4-byte branch, which asserts
   that kind is PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
518
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  The result is widened to Py_UCS4; any
   kind other than 1BYTE or 2BYTE is read as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
530
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   The unicode argument is evaluated several times; index only once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
554
555
/* Fast check to determine whether an object is ready. Equivalent to
   PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any.
   No type check is performed; op is parenthesized under the cast so that
   compound argument expressions are cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
560
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors.
   When the ready flag is already set, the result is 0 and
   _PyUnicode_Ready() is not called. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.
   The result is the upper bound of the storage class: 0x7f for ASCII
   strings, otherwise 0xff, 0xffff or 0x10ffff according to the kind. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7f) :                                                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200582
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000583#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000584
585/* --- Constants ---------------------------------------------------------- */
586
587/* This Unicode character will be used as replacement character during
588 decoding if the errors argument is set to "replace". Note: the
589 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
590 Unicode 3.0. */
591
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200592#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000593
594/* === Public API ========================================================= */
595
596/* --- Plain Py_UNICODE --------------------------------------------------- */
597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598/* With PEP 393, this is the recommended way to allocate a new unicode object.
599 This function will allocate the object and its buffer in a single memory
600 block. Objects created using this function are not resizable. */
601#ifndef Py_LIMITED_API
602PyAPI_FUNC(PyObject*) PyUnicode_New(
603 Py_ssize_t size, /* Number of code points in the new string */
604 Py_UCS4 maxchar /* maximum code point value in the string */
605 );
606#endif
607
Benjamin Peterson82f34ad2015-01-13 09:17:24 -0500608/* Initializes the canonical string representation from the deprecated
Victor Stinnerd8f65102011-09-29 19:43:17 +0200609 wstr/Py_UNICODE representation. This function is used to convert Unicode
610 objects which were created using the old API to the new flexible format
611 introduced with PEP 393.
612
613 Don't call this function directly, use the public PyUnicode_READY() macro
614 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615#ifndef Py_LIMITED_API
616PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200617 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200618 );
619#endif
620
Victor Stinner034f6cf2011-09-30 02:26:44 +0200621/* Get a copy of a Unicode string. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100622#ifndef Py_LIMITED_API
623PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
Victor Stinner034f6cf2011-09-30 02:26:44 +0200624 PyObject *unicode
625 );
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100626#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628/* Copy characters from one unicode object into another. This function performs
Victor Stinner3fe55312012-01-04 00:33:50 +0100629 character conversion when necessary and falls back to memcpy() if possible.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200630
Victor Stinner3fe55312012-01-04 00:33:50 +0100631 Fail if to is too small (smaller than *how_many* or smaller than
Victor Stinnera0702ab2011-09-29 14:14:38 +0200632 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
Victor Stinner3fe55312012-01-04 00:33:50 +0100633 kind(to), or if *to* has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200634
635 Return the number of written characters, or return -1 and raise an exception
636 on error.
637
638 Pseudo-code:
639
640 how_many = min(how_many, len(from) - from_start)
641 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
642 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200643
644 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200645 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200647PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648 PyObject *to,
649 Py_ssize_t to_start,
650 PyObject *from,
651 Py_ssize_t from_start,
652 Py_ssize_t how_many
653 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200654
655/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
656 may crash if parameters are invalid (e.g. if the output string
657 is too short). */
658PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
659 PyObject *to,
660 Py_ssize_t to_start,
661 PyObject *from,
662 Py_ssize_t from_start,
663 Py_ssize_t how_many
664 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665#endif
666
Victor Stinnerd3f08822012-05-29 12:57:52 +0200667#ifndef Py_LIMITED_API
Victor Stinner3fe55312012-01-04 00:33:50 +0100668/* Fill a string with a character: write fill_char into
669 unicode[start:start+length].
670
671 Fail if fill_char is bigger than the string maximum character, or if the
672 string has more than 1 reference.
673
674 Return the number of written characters, or return -1 and raise an exception
675 on error. */
Victor Stinner3fe55312012-01-04 00:33:50 +0100676PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
677 PyObject *unicode,
678 Py_ssize_t start,
679 Py_ssize_t length,
680 Py_UCS4 fill_char
681 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200682
683/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
684 if parameters are invalid (e.g. if length is longer than the string). */
685PyAPI_FUNC(void) _PyUnicode_FastFill(
686 PyObject *unicode,
687 Py_ssize_t start,
688 Py_ssize_t length,
689 Py_UCS4 fill_char
690 );
Victor Stinner3fe55312012-01-04 00:33:50 +0100691#endif
692
Guido van Rossumd8225182000-03-10 22:33:05 +0000693/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000695
696 u may be NULL which causes the contents to be undefined. It is the
697 user's responsibility to fill in the needed data afterwards. Note
698 that modifying the Unicode object contents after construction is
699 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000700
701 The buffer is copied into the new object. */
702
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000703#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000704PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000705 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000706 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000707 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000708#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000709
Georg Brandl952867a2010-06-27 10:17:12 +0000710/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000712 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000713 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714 );
715
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000716/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000718PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000719 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000720 );
721
Victor Stinnerd3f08822012-05-29 12:57:52 +0200722#ifndef Py_LIMITED_API
Victor Stinnerb9275c12011-10-05 14:01:42 +0200723/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
724 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200725PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
726 int kind,
727 const void *buffer,
728 Py_ssize_t size);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200729
730/* Create a new string from a buffer of ASCII characters.
731 WARNING: Don't check if the string contains any non-ASCII character. */
732PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
733 const char *buffer,
734 Py_ssize_t size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735#endif
736
737PyAPI_FUNC(PyObject*) PyUnicode_Substring(
738 PyObject *str,
739 Py_ssize_t start,
740 Py_ssize_t end);
741
Victor Stinnerece58de2012-04-23 23:36:38 +0200742#ifndef Py_LIMITED_API
743/* Compute the maximum character of the substring unicode[start:end].
744 Return 127 for an empty string. */
745PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
746 PyObject *unicode,
747 Py_ssize_t start,
748 Py_ssize_t end);
749#endif
750
Georg Brandldb6c7f52011-10-07 11:19:11 +0200751/* Copy the string into a UCS4 buffer including the null character if copy_null
Serhiy Storchakacc164232016-10-02 21:29:26 +0300752 is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200753 the buffer is smaller than the string. Return buffer on success.
754
755 buflen is the length of the buffer in (Py_UCS4) characters. */
756PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
757 PyObject *unicode,
758 Py_UCS4* buffer,
759 Py_ssize_t buflen,
760 int copy_null);
761
762/* Copy the string into a UCS4 buffer. A new buffer is allocated using
763 * PyMem_Malloc; if this fails, NULL is returned with a memory error
764 exception set. */
765PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
766
Guido van Rossumd8225182000-03-10 22:33:05 +0000767/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 Py_UNICODE buffer.
769 If the wchar_t/Py_UNICODE representation is not yet available, this
770 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000771
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000772#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000773PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000774 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000775 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000776#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778/* Return a read-only pointer to the Unicode object's internal
779 Py_UNICODE buffer and save the length at size.
780 If the wchar_t/Py_UNICODE representation is not yet available, this
781 function will calculate it. */
782
783#ifndef Py_LIMITED_API
784PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
785 PyObject *unicode, /* Unicode object */
786 Py_ssize_t *size /* location where to save the length */
787 );
788#endif
789
Guido van Rossumd8225182000-03-10 22:33:05 +0000790/* Get the length of the Unicode object. */
791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200792PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
793 PyObject *unicode
794);
795
Victor Stinner157f83f2011-09-28 21:41:31 +0200796/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200797 string representation. */
798
Martin v. Löwis18e16552006-02-15 17:27:45 +0000799PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000801 );
802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803/* Read a character from the string. */
804
805PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
806 PyObject *unicode,
807 Py_ssize_t index
808 );
809
810/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200811 PyUnicode_New, must not be shared, and must not have been hashed yet.
812
813 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814
815PyAPI_FUNC(int) PyUnicode_WriteChar(
816 PyObject *unicode,
817 Py_ssize_t index,
818 Py_UCS4 character
819 );
820
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000821#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000822/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000823PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000824#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000825
Martin Panter6245cb32016-04-15 02:14:19 +0000826/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100827 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
828 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000829
830 *unicode is modified to point to the new (resized) object and 0
831 returned on success.
832
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100833 Try to resize the string in place (which is usually faster than allocating
834 a new string and copying characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000835
836 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100837 is returned and *unicode left untouched.
838
839 WARNING: The function doesn't check string content, the result may not be a
840 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000841
Mark Hammond91a681d2002-08-12 07:21:58 +0000842PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 PyObject **unicode, /* Pointer to the Unicode object */
844 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000845 );
846
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300847/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000848
Martin Panter20d32552016-04-15 00:56:21 +0000849 bytes, bytearray and other bytes-like objects are decoded according to the
850 given encoding and error handler. The encoding and error handler can be
851 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000852
Martin Panter20d32552016-04-15 00:56:21 +0000853 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000854
855 The API returns NULL in case of an error. The caller is responsible
856 for decref'ing the returned objects.
857
858*/
859
Mark Hammond91a681d2002-08-12 07:21:58 +0000860PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200861 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000862 const char *encoding, /* encoding */
863 const char *errors /* error handling */
864 );
865
Martin Panter20d32552016-04-15 00:56:21 +0000866/* Copy an instance of a Unicode subtype to a new true Unicode object if
867 necessary. If obj is already a true Unicode object (not a subtype), return
868 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000869
870 The API returns NULL in case of an error. The caller is responsible
871 for decref'ing the returned objects.
872
873*/
874
Mark Hammond91a681d2002-08-12 07:21:58 +0000875PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000877 );
878
Victor Stinner1205f272010-09-11 00:54:47 +0000879PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
880 const char *format, /* ASCII-encoded string */
881 va_list vargs
882 );
883PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
884 const char *format, /* ASCII-encoded string */
885 ...
886 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000888#ifndef Py_LIMITED_API
Victor Stinnerd3f08822012-05-29 12:57:52 +0200889typedef struct {
890 PyObject *buffer;
891 void *data;
892 enum PyUnicode_Kind kind;
893 Py_UCS4 maxchar;
894 Py_ssize_t size;
895 Py_ssize_t pos;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200896
897 /* minimum number of allocated characters (default: 0) */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200898 Py_ssize_t min_length;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200899
900 /* minimum character (default: 127, ASCII) */
901 Py_UCS4 min_char;
902
903 /* If non-zero, overallocate the buffer by 25% (default: 0). */
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200904 unsigned char overallocate;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200905
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200906 /* If readonly is 1, buffer is a shared string (cannot be modified)
907 and size is set to 0. */
908 unsigned char readonly;
Victor Stinnerd3f08822012-05-29 12:57:52 +0200909} _PyUnicodeWriter ;
910
911/* Initialize a Unicode writer.
Victor Stinner8f674cc2013-04-17 23:02:17 +0200912 *
913 * By default, the minimum buffer size is 0 characters and overallocation is
914 * disabled. Set min_length, min_char and overallocate attributes to control
915 * the allocation of the buffer. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200916PyAPI_FUNC(void)
Victor Stinner8f674cc2013-04-17 23:02:17 +0200917_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200918
919/* Prepare the buffer to write 'length' characters
920 with the specified maximum character.
921
922 Return 0 on success, raise an exception and return -1 on error. */
923#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
924 (((MAXCHAR) <= (WRITER)->maxchar \
925 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
926 ? 0 \
927 : (((LENGTH) == 0) \
928 ? 0 \
929 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
930
931/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
932 instead. */
933PyAPI_FUNC(int)
934_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
935 Py_ssize_t length, Py_UCS4 maxchar);
936
Victor Stinnera0dd0212013-04-11 22:09:04 +0200937/* Append a Unicode character.
938 Return 0 on success, raise an exception and return -1 on error. */
939PyAPI_FUNC(int)
940_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
941 Py_UCS4 ch
942 );
943
Victor Stinnere215d962012-10-06 23:03:36 +0200944/* Append a Unicode string.
945 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200946PyAPI_FUNC(int)
Victor Stinnere215d962012-10-06 23:03:36 +0200947_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
948 PyObject *str /* Unicode string */
949 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200950
Victor Stinnercfc4c132013-04-03 01:48:39 +0200951/* Append a substring of a Unicode string.
952 Return 0 on success, raise an exception and return -1 on error. */
953PyAPI_FUNC(int)
954_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
955 PyObject *str, /* Unicode string */
956 Py_ssize_t start,
957 Py_ssize_t end
958 );
959
Serhiy Storchakad65c9492015-11-02 14:10:23 +0200960/* Append an ASCII-encoded byte string.
Victor Stinner4a587072013-11-19 12:54:53 +0100961 Return 0 on success, raise an exception and return -1 on error. */
962PyAPI_FUNC(int)
963_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
964 const char *str, /* ASCII-encoded byte string */
965 Py_ssize_t len /* number of bytes, or -1 if unknown */
966 );
967
Victor Stinnere215d962012-10-06 23:03:36 +0200968/* Append a latin1-encoded byte string.
969 Return 0 on success, raise an exception and return -1 on error. */
970PyAPI_FUNC(int)
Victor Stinner4a587072013-11-19 12:54:53 +0100971_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
972 const char *str, /* latin1-encoded byte string */
973 Py_ssize_t len /* length in bytes */
Victor Stinnere215d962012-10-06 23:03:36 +0200974 );
975
Martin Panter6245cb32016-04-15 02:14:19 +0000976/* Get the value of the writer as a Unicode string. Clear the
Victor Stinnere215d962012-10-06 23:03:36 +0200977 buffer of the writer. Raise an exception and return NULL
978 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200979PyAPI_FUNC(PyObject *)
980_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
981
Victor Stinnere215d962012-10-06 23:03:36 +0200982/* Deallocate memory of a writer (clear its internal buffer). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200983PyAPI_FUNC(void)
984_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
985#endif
986
987#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000988/* Format the object based on the format_spec, as defined in PEP 3101
989 (Advanced String Formatting). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200990PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
991 _PyUnicodeWriter *writer,
992 PyObject *obj,
993 PyObject *format_spec,
994 Py_ssize_t start,
995 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000996#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000997
Walter Dörwald16807132007-05-25 13:52:07 +0000998PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
999PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001000PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1001 const char *u /* UTF-8 encoded string */
1002 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001003#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +00001004PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001005#endif
Walter Dörwald16807132007-05-25 13:52:07 +00001006
1007/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001008#define PyUnicode_CHECK_INTERNED(op) \
1009 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +00001010
Guido van Rossumd8225182000-03-10 22:33:05 +00001011/* --- wchar_t support for platforms which support it --------------------- */
1012
1013#ifdef HAVE_WCHAR_H
1014
Georg Brandl952867a2010-06-27 10:17:12 +00001015/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +00001016 size.
1017
1018 The buffer is copied into the new object. */
1019
Mark Hammond91a681d2002-08-12 07:21:58 +00001020PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001021 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001023 );
1024
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001025/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +00001026 most size wchar_t characters are copied.
1027
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001028 Note that the resulting wchar_t string may or may not be
1029 0-terminated. It is the responsibility of the caller to make sure
1030 that the wchar_t string is 0-terminated in case this is required by
1031 the application.
1032
1033 Returns the number of wchar_t characters copied (excluding a
1034 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +00001035 error. */
1036
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001038 PyObject *unicode, /* Unicode object */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001039 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001040 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001041 );
1042
Victor Stinner137c34c2010-09-29 10:25:54 +00001043/* Convert the Unicode object to a wide character string. The output string
1044 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +02001045 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +00001046
Victor Stinner22fabe22015-02-11 18:17:56 +01001047 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
Victor Stinner137c34c2010-09-29 10:25:54 +00001048 on success. On error, returns NULL, *size is undefined and raises a
1049 MemoryError. */
1050
1051PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001052 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +00001053 Py_ssize_t *size /* number of characters of the result */
1054 );
1055
Victor Stinner9f789e72011-10-01 03:57:28 +02001056#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +02001058#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059
Guido van Rossumd8225182000-03-10 22:33:05 +00001060#endif
1061
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062/* --- Unicode ordinals --------------------------------------------------- */
1063
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001064/* Create a Unicode Object from the given Unicode code point ordinal.
1065
Ezio Melottie7f90372012-10-05 03:33:31 +03001066 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067 raised in case it is not.
1068
1069*/
1070
Marc-André Lemburg9c329de2002-08-12 08:19:10 +00001071PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001073/* --- Free-list management ----------------------------------------------- */
1074
1075/* Clear the free list used by the Unicode implementation.
1076
1077 This can be used to release memory used for objects on the free
1078 list back to the Python memory allocator.
1079
1080*/
1081
1082PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1083
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001084/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +00001085
1086 Many of these APIs take two arguments encoding and errors. These
1087 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001088 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +00001089
Georg Brandl952867a2010-06-27 10:17:12 +00001090 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +00001091
1092 Error handling is set by errors which may also be set to NULL
1093 meaning to use the default handling defined for the codec. Default
1094 error handling for all builtin codecs is "strict" (ValueErrors are
1095 raised).
1096
1097 The codecs all use a similar interface. Only deviations from the
1098 generic ones are documented.
1099
1100*/
1101
Fred Drakecb093fe2000-05-09 19:51:53 +00001102/* --- Manage the default encoding ---------------------------------------- */
1103
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001104/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001105 Unicode object unicode and the size of the encoded representation
1106 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +00001107
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001108 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001109
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001110 This function caches the UTF-8 encoded string in the unicodeobject
1111 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 when the unicodeobject is deallocated.
1113
1114 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1115 support the previous internal function with the same behaviour.
1116
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001117 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001118 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001119
1120 *** If you need to access the Unicode object as UTF-8 bytes string,
1121 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +00001122*/
1123
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001124#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001127 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001129#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001130
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001131/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001132 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1135 in the unicodeobject.
1136
1137 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1138 support the previous internal function with the same behaviour.
1139
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001140 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001141 extracted from the returned data.
1142
1143 *** This API is for interpreter INTERNAL USE ONLY and will likely
1144 *** be removed or changed in the future.
1145
1146 *** If you need to access the Unicode object as UTF-8 bytes string,
1147 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001148
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001149*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001150
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001151#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1153#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001154#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001155
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001156/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001157
Mark Hammond91a681d2002-08-12 07:21:58 +00001158PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001159
Guido van Rossumd8225182000-03-10 22:33:05 +00001160/* --- Generic Codecs ----------------------------------------------------- */
1161
1162/* Create a Unicode object by decoding the encoded string s of the
1163 given size. */
1164
Mark Hammond91a681d2002-08-12 07:21:58 +00001165PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001166 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001168 const char *encoding, /* encoding */
1169 const char *errors /* error handling */
1170 );
1171
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001172/* Decode a Unicode object unicode and return the result as Python
1173 object. */
1174
1175PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001176 PyObject *unicode, /* Unicode object */
1177 const char *encoding, /* encoding */
1178 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001179 );
1180
1181/* Decode a Unicode object unicode and return the result as Unicode
1182 object. */
1183
1184PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 PyObject *unicode, /* Unicode object */
1186 const char *encoding, /* encoding */
1187 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001188 );
1189
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001191 Python string object. */
1192
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001193#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001194PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001195 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001196 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001197 const char *encoding, /* encoding */
1198 const char *errors /* error handling */
1199 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001200#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001201
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001202/* Encodes a Unicode object and returns the result as Python
1203 object. */
1204
1205PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 PyObject *unicode, /* Unicode object */
1207 const char *encoding, /* encoding */
1208 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001209 );
1210
Guido van Rossumd8225182000-03-10 22:33:05 +00001211/* Encodes a Unicode object and returns the result as Python string
1212 object. */
1213
Mark Hammond91a681d2002-08-12 07:21:58 +00001214PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 PyObject *unicode, /* Unicode object */
1216 const char *encoding, /* encoding */
1217 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001218 );
1219
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001220/* Encodes a Unicode object and returns the result as Unicode
1221 object. */
1222
1223PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 PyObject *unicode, /* Unicode object */
1225 const char *encoding, /* encoding */
1226 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001227 );
1228
1229/* Build an encoding map. */
1230
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001231PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1232 PyObject* string /* 256 character map */
1233 );
1234
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001235/* --- UTF-7 Codecs ------------------------------------------------------- */
1236
Mark Hammond91a681d2002-08-12 07:21:58 +00001237PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 const char *string, /* UTF-7 encoded string */
1239 Py_ssize_t length, /* size of string */
1240 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001241 );
1242
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001243PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 const char *string, /* UTF-7 encoded string */
1245 Py_ssize_t length, /* size of string */
1246 const char *errors, /* error handling */
1247 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001248 );
1249
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001250#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001251PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001252 const Py_UNICODE *data, /* Unicode char buffer */
1253 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1254 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1255 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1256 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001257 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001258PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1259 PyObject *unicode, /* Unicode object */
1260 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1261 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1262 const char *errors /* error handling */
1263 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001264#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001265
Guido van Rossumd8225182000-03-10 22:33:05 +00001266/* --- UTF-8 Codecs ------------------------------------------------------- */
1267
Mark Hammond91a681d2002-08-12 07:21:58 +00001268PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 const char *string, /* UTF-8 encoded string */
1270 Py_ssize_t length, /* size of string */
1271 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001272 );
1273
Walter Dörwald69652032004-09-07 20:24:22 +00001274PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 const char *string, /* UTF-8 encoded string */
1276 Py_ssize_t length, /* size of string */
1277 const char *errors, /* error handling */
1278 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001279 );
1280
Mark Hammond91a681d2002-08-12 07:21:58 +00001281PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001283 );
1284
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1287 PyObject *unicode,
1288 const char *errors);
1289
Mark Hammond91a681d2002-08-12 07:21:58 +00001290PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 const Py_UNICODE *data, /* Unicode char buffer */
1292 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1293 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001294 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001295#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001296
Walter Dörwald41980ca2007-08-16 21:55:45 +00001297/* --- UTF-32 Codecs ------------------------------------------------------ */
1298
1299/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1300 the corresponding Unicode object.
1301
1302 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001304
1305 If byteorder is non-NULL, the decoder starts decoding using the
1306 given byte order:
1307
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 *byteorder == -1: little endian
1309 *byteorder == 0: native order
1310 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001311
1312 In native mode, the first four bytes of the stream are checked for a
1313 BOM mark. If found, the BOM mark is analysed, the byte order
1314 adjusted and the BOM skipped. In the other modes, no BOM mark
1315 interpretation is done. After completion, *byteorder is set to the
1316 current byte order at the end of input data.
1317
1318 If byteorder is NULL, the codec starts in native order mode.
1319
1320*/
1321
1322PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 const char *string, /* UTF-32 encoded string */
1324 Py_ssize_t length, /* size of string */
1325 const char *errors, /* error handling */
1326 int *byteorder /* pointer to byteorder to use
1327 0=native;-1=LE,1=BE; updated on
1328 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001329 );
1330
1331PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 const char *string, /* UTF-32 encoded string */
1333 Py_ssize_t length, /* size of string */
1334 const char *errors, /* error handling */
1335 int *byteorder, /* pointer to byteorder to use
1336 0=native;-1=LE,1=BE; updated on
1337 exit */
1338 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001339 );
1340
1341/* Returns a Python string using the UTF-32 encoding in native byte
1342 order. The string always starts with a BOM mark. */
1343
1344PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001345 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001346 );
1347
1348/* Returns a Python string object holding the UTF-32 encoded value of
1349 the Unicode data.
1350
1351 If byteorder is not 0, output is written according to the following
1352 byte order:
1353
1354 byteorder == -1: little endian
1355 byteorder == 0: native byte order (writes a BOM mark)
1356 byteorder == 1: big endian
1357
1358 If byteorder is 0, the output string will always start with the
1359 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1360 prepended.
1361
1362*/
1363
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001364#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001365PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 const Py_UNICODE *data, /* Unicode char buffer */
1367 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1368 const char *errors, /* error handling */
1369 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001370 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001371PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1372 PyObject *object, /* Unicode object */
1373 const char *errors, /* error handling */
1374 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1375 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001376#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001377
Guido van Rossumd8225182000-03-10 22:33:05 +00001378/* --- UTF-16 Codecs ------------------------------------------------------ */
1379
Guido van Rossum9e896b32000-04-05 20:11:21 +00001380/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001381 the corresponding Unicode object.
1382
1383 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001385
1386 If byteorder is non-NULL, the decoder starts decoding using the
1387 given byte order:
1388
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 *byteorder == -1: little endian
1390 *byteorder == 0: native order
1391 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001392
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001393 In native mode, the first two bytes of the stream are checked for a
1394 BOM mark. If found, the BOM mark is analysed, the byte order
1395 adjusted and the BOM skipped. In the other modes, no BOM mark
1396 interpretation is done. After completion, *byteorder is set to the
1397 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001398
1399 If byteorder is NULL, the codec starts in native order mode.
1400
1401*/
1402
Mark Hammond91a681d2002-08-12 07:21:58 +00001403PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 const char *string, /* UTF-16 encoded string */
1405 Py_ssize_t length, /* size of string */
1406 const char *errors, /* error handling */
1407 int *byteorder /* pointer to byteorder to use
1408 0=native;-1=LE,1=BE; updated on
1409 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001410 );
1411
Walter Dörwald69652032004-09-07 20:24:22 +00001412PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 const char *string, /* UTF-16 encoded string */
1414 Py_ssize_t length, /* size of string */
1415 const char *errors, /* error handling */
1416 int *byteorder, /* pointer to byteorder to use
1417 0=native;-1=LE,1=BE; updated on
1418 exit */
1419 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001420 );
1421
Guido van Rossumd8225182000-03-10 22:33:05 +00001422/* Returns a Python string using the UTF-16 encoding in native byte
1423 order. The string always starts with a BOM mark. */
1424
Mark Hammond91a681d2002-08-12 07:21:58 +00001425PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001426 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001427 );
1428
1429/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001430 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001431
1432 If byteorder is not 0, output is written according to the following
1433 byte order:
1434
1435 byteorder == -1: little endian
1436 byteorder == 0: native byte order (writes a BOM mark)
1437 byteorder == 1: big endian
1438
1439 If byteorder is 0, the output string will always start with the
1440 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1441 prepended.
1442
1443 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1444 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001445 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001446
1447*/
1448
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001449#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001450PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001451 const Py_UNICODE *data, /* Unicode char buffer */
1452 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1453 const char *errors, /* error handling */
1454 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001455 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001456PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1457 PyObject* unicode, /* Unicode object */
1458 const char *errors, /* error handling */
1459 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1460 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001461#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001462
1463/* --- Unicode-Escape Codecs ---------------------------------------------- */
1464
Mark Hammond91a681d2002-08-12 07:21:58 +00001465PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001466 const char *string, /* Unicode-Escape encoded string */
1467 Py_ssize_t length, /* size of string */
1468 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001469 );
1470
Mark Hammond91a681d2002-08-12 07:21:58 +00001471PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001472 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001473 );
1474
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001475#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001476PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 const Py_UNICODE *data, /* Unicode char buffer */
1478 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001479 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001480#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001481
1482/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1483
Mark Hammond91a681d2002-08-12 07:21:58 +00001484PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 const char *string, /* Raw-Unicode-Escape encoded string */
1486 Py_ssize_t length, /* size of string */
1487 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001488 );
1489
Mark Hammond91a681d2002-08-12 07:21:58 +00001490PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001491 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001492 );
1493
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001494#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001495PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 const Py_UNICODE *data, /* Unicode char buffer */
1497 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001498 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001499#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001501/* --- Unicode Internal Codec ---------------------------------------------
1502
1503 Only for internal use in _codecsmodule.c */
1504
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001505#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001506PyObject *_PyUnicode_DecodeUnicodeInternal(
1507 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001508 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001509 const char *errors
1510 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001511#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001512
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001514
1515 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1516
1517*/
1518
Mark Hammond91a681d2002-08-12 07:21:58 +00001519PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 const char *string, /* Latin-1 encoded string */
1521 Py_ssize_t length, /* size of string */
1522 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001523 );
1524
Mark Hammond91a681d2002-08-12 07:21:58 +00001525PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001526 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001527 );
1528
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001529#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1531 PyObject* unicode,
1532 const char* errors);
1533
Mark Hammond91a681d2002-08-12 07:21:58 +00001534PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001535 const Py_UNICODE *data, /* Unicode char buffer */
1536 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1537 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001538 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001539#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001540
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001541/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001542
1543 Only 7-bit ASCII data is accepted. All other codes generate errors.
1544
1545*/
1546
Mark Hammond91a681d2002-08-12 07:21:58 +00001547PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001548 const char *string, /* ASCII encoded string */
1549 Py_ssize_t length, /* size of string */
1550 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001551 );
1552
Mark Hammond91a681d2002-08-12 07:21:58 +00001553PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001554 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001555 );
1556
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001557#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1559 PyObject* unicode,
1560 const char* errors);
1561
Mark Hammond91a681d2002-08-12 07:21:58 +00001562PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 const Py_UNICODE *data, /* Unicode char buffer */
1564 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1565 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001566 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001567#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001568
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001569/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001570
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001571 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001572
1573 Decoding mappings must map single string characters to single
1574 Unicode characters, integers (which are then interpreted as Unicode
1575 ordinals) or None (meaning "undefined mapping" and causing an
1576 error).
1577
1578 Encoding mappings must map single Unicode characters to single
1579 string characters, integers (which are then interpreted as Latin-1
1580 ordinals) or None (meaning "undefined mapping" and causing an
1581 error).
1582
1583 If a character lookup fails with a LookupError, the character is
1584 copied as-is meaning that its ordinal value will be interpreted as
1585 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1586 to contain those mappings which map characters to different code
1587 points.
1588
1589*/
1590
Mark Hammond91a681d2002-08-12 07:21:58 +00001591PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001592 const char *string, /* Encoded string */
1593 Py_ssize_t length, /* size of string */
1594 PyObject *mapping, /* character mapping
1595 (char ordinal -> unicode ordinal) */
1596 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001597 );
1598
Mark Hammond91a681d2002-08-12 07:21:58 +00001599PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600 PyObject *unicode, /* Unicode object */
1601 PyObject *mapping /* character mapping
1602 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001603 );
1604
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001605#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001606PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 const Py_UNICODE *data, /* Unicode char buffer */
1608 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1609 PyObject *mapping, /* character mapping
1610 (unicode ordinal -> char ordinal) */
1611 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001612 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001613PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1614 PyObject *unicode, /* Unicode object */
1615 PyObject *mapping, /* character mapping
1616 (unicode ordinal -> char ordinal) */
1617 const char *errors /* error handling */
1618 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001619#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001620
1621/* Translate a Py_UNICODE buffer of the given length by applying a
1622 character mapping table to it and return the resulting Unicode
1623 object.
1624
1625 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001626 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001627
1628 Mapping tables may be dictionaries or sequences. Unmapped character
1629 ordinals (ones which cause a LookupError) are left untouched and
1630 are copied as-is.
1631
1632*/
1633
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001634#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001635PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001636 const Py_UNICODE *data, /* Unicode char buffer */
1637 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1638 PyObject *table, /* Translate table */
1639 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001640 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001641#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001642
Victor Stinner99b95382011-07-04 14:23:54 +02001643#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001644
Guido van Rossumefec1152000-03-28 02:01:15 +00001645/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001646
Mark Hammond91a681d2002-08-12 07:21:58 +00001647PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001648 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001649 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001650 const char *errors /* error handling */
1651 );
1652
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001653PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1654 const char *string, /* MBCS encoded string */
1655 Py_ssize_t length, /* size of string */
1656 const char *errors, /* error handling */
1657 Py_ssize_t *consumed /* bytes consumed */
1658 );
1659
Victor Stinner3a50e702011-10-18 21:21:00 +02001660PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1661 int code_page, /* code page number */
1662 const char *string, /* encoded string */
1663 Py_ssize_t length, /* size of string */
1664 const char *errors, /* error handling */
1665 Py_ssize_t *consumed /* bytes consumed */
1666 );
1667
Mark Hammond91a681d2002-08-12 07:21:58 +00001668PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001669 PyObject *unicode /* Unicode object */
1670 );
1671
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001672#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001673PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001674 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001675 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001676 const char *errors /* error handling */
1677 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001678#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001679
Victor Stinner3a50e702011-10-18 21:21:00 +02001680PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1681 int code_page, /* code page number */
1682 PyObject *unicode, /* Unicode object */
1683 const char *errors /* error handling */
1684 );
1685
Victor Stinner99b95382011-07-04 14:23:54 +02001686#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001687
Guido van Rossum9e896b32000-04-05 20:11:21 +00001688/* --- Decimal Encoder ---------------------------------------------------- */
1689
1690/* Takes a Unicode string holding a decimal value and writes it into
1691 an output buffer using standard ASCII digit codes.
1692
1693 The output buffer has to provide at least length+1 bytes of storage
1694 area. The output string is 0-terminated.
1695
1696 The encoder converts whitespace to ' ', decimal characters to their
1697 corresponding ASCII digit and all other Latin-1 characters except
1698 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1699 are treated as errors. This includes embedded NULL bytes.
1700
1701 Error handling is defined by the errors argument:
1702
1703 NULL or "strict": raise a ValueError
1704 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001705 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001706 "replace": replaces illegal characters with '?'
1707
1708 Returns 0 on success, -1 on failure.
1709
1710*/
1711
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001712#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001713PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714 Py_UNICODE *s, /* Unicode buffer */
1715 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1716 char *output, /* Output buffer; must have size >= length+1
                     (the output string is 0-terminated) */
1717 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001718 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001719#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001720
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001721/* Transforms code points that have decimal digit property to the
1722 corresponding ASCII digit code points.
1723
1724 Returns a new Unicode string on success, NULL on failure.
1725*/
1726
Georg Brandlb5503082010-12-05 11:40:48 +00001727#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001728PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1729 Py_UNICODE *s, /* Unicode buffer */
1730 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1731 );
Georg Brandlb5503082010-12-05 11:40:48 +00001732#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001733
Victor Stinner6f9568b2011-11-17 00:12:44 +01001734/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 as argument instead of a raw buffer and length. This function additionally
1736 transforms spaces to ASCII because this is what the callers in longobject,
1737 floatobject, and complexobject did anyway. */
1738
1739#ifndef Py_LIMITED_API
1740PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1741 PyObject *unicode /* Unicode object */
1742 );
1743#endif
1744
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001745/* --- Locale encoding --------------------------------------------------- */
1746
1747/* Decode a string from the current locale encoding. The decoder is strict if
1748 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
1749 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
1750 be decoded as a surrogate character and *surrogateescape* is not equal to
1751 zero, the byte sequence is escaped using the 'surrogateescape' error handler
1752 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001753 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001754
1755PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1756 const char *str,
1757 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01001758 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001759
1760/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1761 length using strlen(). */
1762
1763PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1764 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001765 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001766
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001767/* Encode a Unicode object to the current locale encoding. The encoder is
1768 strict if *surrogateescape* is equal to zero, otherwise the
1769 "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +01001770 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001771
1772PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1773 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +01001774 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001775 );
1776
Martin v. Löwis011e8422009-05-05 04:43:17 +00001777/* --- File system encoding ---------------------------------------------- */
1778
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001779/* ParseTuple converter: encode str objects to bytes using
1780 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001781
1782PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1783
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001784/* ParseTuple converter: decode bytes objects to unicode using
1785 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1786
1787PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1788
Victor Stinner77c38622010-05-14 15:58:55 +00001789/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1790 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001791
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001792 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1793 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001794
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001795 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001796*/
1797
1798PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1799 const char *s /* encoded string */
1800 );
1801
Victor Stinner77c38622010-05-14 15:58:55 +00001802/* Decode a string using Py_FileSystemDefaultEncoding
1803 and the "surrogateescape" error handler.
1804
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001805 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1806 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001807*/
1808
Martin v. Löwis011e8422009-05-05 04:43:17 +00001809PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1810 const char *s, /* encoded string */
1811 Py_ssize_t size /* size */
1812 );
1813
Victor Stinnerae6265f2010-05-15 16:27:27 +00001814/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001815 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001816
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001817 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1818 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001819*/
1820
1821PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1822 PyObject *unicode
1823 );
1824
Guido van Rossumd8225182000-03-10 22:33:05 +00001825/* --- Methods & Slots ----------------------------------------------------
1826
1827 These are capable of handling Unicode objects and strings on input
1828 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001829 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001830
1831/* Concat two strings giving a new Unicode string. */
1832
Mark Hammond91a681d2002-08-12 07:21:58 +00001833PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001834 PyObject *left, /* Left string */
1835 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001836 );
1837
Walter Dörwald1ab83302007-05-18 17:15:44 +00001838/* Concat two strings and put the result in *pleft
1839 (sets *pleft to NULL on error) */
1840
1841PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001842 PyObject **pleft, /* Pointer to left string */
1843 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001844 );
1845
1846/* Concat two strings, put the result in *pleft and drop the right object
1847 (sets *pleft to NULL on error) */
1848
1849PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001850 PyObject **pleft, /* Pointer to left string */
1851 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001852 );
1853
Guido van Rossumd8225182000-03-10 22:33:05 +00001854/* Split a string giving a list of Unicode strings.
1855
1856 If sep is NULL, splitting will be done at all whitespace
1857 substrings. Otherwise, splits occur at the given separator.
1858
1859 At most maxsplit splits will be done. If negative, no limit is set.
1860
1861 Separators are not included in the resulting list.
1862
1863*/
1864
Mark Hammond91a681d2002-08-12 07:21:58 +00001865PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001866 PyObject *s, /* String to split */
1867 PyObject *sep, /* String separator */
1868 Py_ssize_t maxsplit /* Maxsplit count */
1869 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001870
1871/* Ditto, but split at line breaks.
1872
1873 CRLF is considered to be one line break. Line breaks are not
1874 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001875
Mark Hammond91a681d2002-08-12 07:21:58 +00001876PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001877 PyObject *s, /* String to split */
1878 int keepends /* If true, line end markers are included */
1879 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001880
Thomas Wouters477c8d52006-05-27 19:21:47 +00001881/* Partition a string using a given separator. */
1882
1883PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001884 PyObject *s, /* String to partition */
1885 PyObject *sep /* String separator */
1886 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001887
1888/* Partition a string using a given separator, searching from the end of the
1889 string. */
1890
1891PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001892 PyObject *s, /* String to partition */
1893 PyObject *sep /* String separator */
1894 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001895
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001896/* Split a string giving a list of Unicode strings.
1897
1898 If sep is NULL, splitting will be done at all whitespace
1899 substrings. Otherwise, splits occur at the given separator.
1900
1901 At most maxsplit splits will be done. If negative, no limit is set.
1902 Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of the
1903 string.
1904
1905 Separators are not included in the resulting list.
1906
1907*/
1908
1909PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001910 PyObject *s, /* String to split */
1911 PyObject *sep, /* String separator */
1912 Py_ssize_t maxsplit /* Maxsplit count */
1913 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001914
Guido van Rossumd8225182000-03-10 22:33:05 +00001915/* Translate a string by applying a character mapping table to it and
1916 return the resulting Unicode object.
1917
1918 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001919 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001920
1921 Mapping tables may be dictionaries or sequences. Unmapped character
1922 ordinals (ones which cause a LookupError) are left untouched and
1923 are copied as-is.
1924
1925*/
1926
Mark Hammond91a681d2002-08-12 07:21:58 +00001927PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001928 PyObject *str, /* String */
1929 PyObject *table, /* Translate table */
1930 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001931 );
1932
1933/* Join a sequence of strings using the given separator and return
1934 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001935
Mark Hammond91a681d2002-08-12 07:21:58 +00001936PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001937 PyObject *separator, /* Separator string */
1938 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001939 );
1940
1941/* Return 1 if substr matches str[start:end] at the given tail end, 0
1942 otherwise. Return -1 if an error occurred. */
1943
Martin v. Löwis18e16552006-02-15 17:27:45 +00001944PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001945 PyObject *str, /* String */
1946 PyObject *substr, /* Prefix or Suffix string */
1947 Py_ssize_t start, /* Start index */
1948 Py_ssize_t end, /* Stop index */
1949 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001950 );
1951
1952/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001953 given search direction or -1 if not found. -2 is returned in case
1954 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001955
Martin v. Löwis18e16552006-02-15 17:27:45 +00001956PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001957 PyObject *str, /* String */
1958 PyObject *substr, /* Substring to find */
1959 Py_ssize_t start, /* Start index */
1960 Py_ssize_t end, /* Stop index */
1961 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001962 );
1963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964/* Like PyUnicode_Find, but search for single character only. */
1965PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1966 PyObject *str,
1967 Py_UCS4 ch,
1968 Py_ssize_t start,
1969 Py_ssize_t end,
1970 int direction
1971 );
1972
Barry Warsaw51ac5802000-03-20 16:36:48 +00001973/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001974
Martin v. Löwis18e16552006-02-15 17:27:45 +00001975PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001976 PyObject *str, /* String */
1977 PyObject *substr, /* Substring to count */
1978 Py_ssize_t start, /* Start index */
1979 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001980 );
1981
Barry Warsaw51ac5802000-03-20 16:36:48 +00001982/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001983 and return the resulting Unicode object. */
1984
Mark Hammond91a681d2002-08-12 07:21:58 +00001985PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001986 PyObject *str, /* String */
1987 PyObject *substr, /* Substring to find */
1988 PyObject *replstr, /* Substring to replace */
1989 Py_ssize_t maxcount /* Max. number of replacements to apply;
1990 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001991 );
1992
1993/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +02001994 and greater than, respectively.
1995 Raise an exception and return -1 on error. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001996
Mark Hammond91a681d2002-08-12 07:21:58 +00001997PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001998 PyObject *left, /* Left string */
1999 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00002000 );
2001
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002002#ifndef Py_LIMITED_API
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002003/* Compare a string with an identifier and return -1, 0, 1 for less than,
2004 equal, and greater than, respectively.
2005 Raise an exception and return -1 on error. */
2006
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002007PyAPI_FUNC(int) _PyUnicode_CompareWithId(
2008 PyObject *left, /* Left string */
2009 _Py_Identifier *right /* Right identifier */
2010 );
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002011
2012/* Test whether a Unicode object is equal to an ASCII identifier. Return 1 if
2013 true, 0 otherwise. Return 0 if any argument contains non-ASCII characters.
2014 Any error that occurs inside is cleared before returning. */
2015
2016PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
2017 PyObject *left, /* Left string */
2018 _Py_Identifier *right /* Right identifier */
2019 );
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002020#endif
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002021
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002022/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
2023 equal, and greater than, respectively. It is best to pass only
2024 ASCII-encoded strings, but the function interprets the input string as
2025 ISO-8859-1 if it contains non-ASCII characters.
Serhiy Storchaka419967b2016-12-06 00:13:34 +02002026 This function does not raise exceptions. */
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +02002027
Martin v. Löwis5b222132007-06-10 09:51:05 +00002028PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2029 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00002030 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00002031 );
2032
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +02002033#ifndef Py_LIMITED_API
2034/* Test whether a Unicode object is equal to an ASCII string. Return 1 if
2035 true, 0 otherwise. Return 0 if any argument contains non-ASCII characters.
2036 Any error that occurs inside is cleared before returning. */
2037
2038PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
2039 PyObject *left,
2040 const char *right /* ASCII-encoded string */
2041 );
2042#endif
2043
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002044/* Rich compare two strings and return one of the following:
2045
2046 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +00002047 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002048 - Py_NotImplemented in case the type combination is unknown
2049
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002050 Possible values for op:
2051
2052 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2053
2054*/
2055
2056PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002057 PyObject *left, /* Left string */
2058 PyObject *right, /* Right string */
2059 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002060 );
2061
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002062/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00002063 the resulting Unicode string. */
2064
Mark Hammond91a681d2002-08-12 07:21:58 +00002065PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002066 PyObject *format, /* Format string */
2067 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00002068 );
2069
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002070/* Check whether element is contained in container and return 1/0
2071 accordingly.
2072
Martin Pantercc71a792016-04-05 06:19:42 +00002073 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002074 returned in case of an error. */
2075
Mark Hammond91a681d2002-08-12 07:21:58 +00002076PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002077 PyObject *container, /* Container string */
2078 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002079 );
2080
Martin v. Löwis47383402007-08-15 07:32:56 +00002081/* Checks whether argument is a valid identifier. */
2082
2083PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2084
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002085#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002086/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00002087PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002088 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002089 int striptype,
2090 PyObject *sepobj
2091 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002092#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002093
Eric Smitha3b1ac82009-04-03 14:45:06 +00002094/* Using explicit passed-in values, insert the thousands grouping
2095 into the string pointed to by buffer. For the argument descriptions,
2096 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002097#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02002099 PyObject *unicode,
Victor Stinner41a863c2012-02-24 00:37:51 +01002100 Py_ssize_t index,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 Py_ssize_t n_buffer,
2102 void *digits,
2103 Py_ssize_t n_digits,
2104 Py_ssize_t min_width,
2105 const char *grouping,
Victor Stinner41a863c2012-02-24 00:37:51 +01002106 PyObject *thousands_sep,
2107 Py_UCS4 *maxchar);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002108#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002109/* === Characters Type APIs =============================================== */
2110
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002111/* Helper array used by Py_UNICODE_ISSPACE(). */
2112
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002113#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002114PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2115
Guido van Rossumd8225182000-03-10 22:33:05 +00002116/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002117 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00002118
2119 These APIs are implemented in Objects/unicodectype.c.
2120
2121*/
2122
Mark Hammond91a681d2002-08-12 07:21:58 +00002123PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002124 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002125 );
2126
Mark Hammond91a681d2002-08-12 07:21:58 +00002127PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002128 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002129 );
2130
Mark Hammond91a681d2002-08-12 07:21:58 +00002131PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002132 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002133 );
2134
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002135PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002136 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002137 );
2138
2139PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002140 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002141 );
2142
Mark Hammond91a681d2002-08-12 07:21:58 +00002143PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002144 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002145 );
2146
Mark Hammond91a681d2002-08-12 07:21:58 +00002147PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002148 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002149 );
2150
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002151PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2152 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002153 );
2154
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002155PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2156 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002157 );
2158
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002159PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2160 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002161 );
2162
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002163PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2164 Py_UCS4 ch, /* Unicode character */
2165 Py_UCS4 *res
2166 );
2167
2168PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2169 Py_UCS4 ch, /* Unicode character */
2170 Py_UCS4 *res
2171 );
2172
2173PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2174 Py_UCS4 ch, /* Unicode character */
2175 Py_UCS4 *res
2176 );
2177
Benjamin Petersond5890c82012-01-14 13:23:30 -05002178PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2179 Py_UCS4 ch, /* Unicode character */
2180 Py_UCS4 *res
2181 );
2182
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002183PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002184 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002185 );
2186
2187PyAPI_FUNC(int) _PyUnicode_IsCased(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002188 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002189 );
2190
Mark Hammond91a681d2002-08-12 07:21:58 +00002191PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002192 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002193 );
2194
Mark Hammond91a681d2002-08-12 07:21:58 +00002195PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002196 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002197 );
2198
Mark Hammond91a681d2002-08-12 07:21:58 +00002199PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002200 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002201 );
2202
Mark Hammond91a681d2002-08-12 07:21:58 +00002203PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002204 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002205 );
2206
Mark Hammond91a681d2002-08-12 07:21:58 +00002207PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002208 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002209 );
2210
Mark Hammond91a681d2002-08-12 07:21:58 +00002211PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002212 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002213 );
2214
Georg Brandl559e5d72008-06-11 18:37:52 +00002215PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002216 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002217 );
2218
Mark Hammond91a681d2002-08-12 07:21:58 +00002219PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002220 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002221 );
2222
Victor Stinneref8d95c2010-08-16 22:03:11 +00002223PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2224 const Py_UNICODE *u
2225 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002226
2227PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002228 Py_UNICODE *s1,
2229 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002230
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002231PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2232 Py_UNICODE *s1, const Py_UNICODE *s2);
2233
Martin v. Löwis5b222132007-06-10 09:51:05 +00002234PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002235 Py_UNICODE *s1,
2236 const Py_UNICODE *s2,
2237 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002238
2239PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002240 const Py_UNICODE *s1,
2241 const Py_UNICODE *s2
2242 );
2243
2244PyAPI_FUNC(int) Py_UNICODE_strncmp(
2245 const Py_UNICODE *s1,
2246 const Py_UNICODE *s2,
2247 size_t n
2248 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002249
2250PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002251 const Py_UNICODE *s,
2252 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002253 );
2254
Victor Stinner331ea922010-08-10 16:37:20 +00002255PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002256 const Py_UNICODE *s,
2257 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002258 );
2259
Ethan Furmanb95b5612015-01-23 20:05:18 -08002260PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2261
Victor Stinner71133ff2010-09-01 23:43:53 +00002262/* Create a copy of a Unicode string ending with a null character. Return NULL
2263 and raise a MemoryError exception on memory allocation failure, otherwise
2264 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2265
Victor Stinner46408602010-09-03 16:18:00 +00002266PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002267 PyObject *unicode
2268 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002269#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002270
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002271#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002272PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002273 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002274 int check_content);
2275#endif
2276
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002277/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2278PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2279/* Clear all static strings. */
2280PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2281
Guido van Rossumd8225182000-03-10 22:33:05 +00002282#ifdef __cplusplus
2283}
2284#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002285#endif /* !Py_UNICODEOBJECT_H */