blob: b6d331bb8d3d7582ce463bdc870edf4087dca922 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
/* Whitespace test with an ASCII fast path: values below 128 are answered
   straight from the _Py_ascii_whitespace table, anything else is handed to
   _PyUnicode_IsWhitespace().  `ch` appears twice in the expansion, so do
   not pass an expression with side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
/* Case and line-break predicates; each one forwards to the matching
   _PyUnicode_Is*() helper. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions; each one forwards to the matching
   _PyUnicode_To*case() helper. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Conversions from a character to its decimal, digit or numeric value. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True when `ch` passes any of the alpha, decimal, digit or numeric tests.
   The tests are tried in that order and short-circuit on the first hit;
   `ch` is expanded once per test, so it must be free of side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200174#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store `value` into the first `length` elements of the Py_UNICODE buffer
   `target`.  Every argument is evaluated exactly once: the element count is
   captured in a local before the loop instead of being re-evaluated on each
   iteration, so arguments with side effects (or an expensive length
   expression) are safe.  A zero or negative length writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300182/* macros to work with surrogates */
/* Surrogate range tests.  Each test clears the low bits of `ch` and compares
   the remaining prefix, so `ch` is evaluated exactly once and may carry side
   effects (e.g. *p++):
     - U+D800..U+DFFF differ only in their low 11 bits,
     - high (lead) surrogates U+D800..U+DBFF only in their low 10 bits,
     - low (trail) surrogates U+DC00..U+DFFF only in their low 10 bits.
   The masks are deliberately signed int constants: they sign-extend when
   `ch` has a wider type, so no high bits of `ch` are discarded. */
#define Py_UNICODE_IS_SURROGATE(ch) (((ch) & ~0x7FF) == 0xD800)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (((ch) & ~0x3FF) == 0xD800)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (((ch) & ~0x3FF) == 0xDC00)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; no check is made that
   the arguments really are a high/low surrogate pair. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800
   (after subtracting the 0x10000 offset of the supplementary planes) */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300194
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000195/* Check if substring matches at given offset. The offset must be
196 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000197
/* Evaluates to 1 when the code units of `substring` occur in `string`
   starting at `offset`, else 0.  The first and the last code unit are
   compared before falling back to a memcmp() over
   wstr_length * sizeof(Py_UNICODE) bytes.  No bounds checking is done and
   every argument is expanded several times.
   NOTE(review): the arguments are dereferenced directly as ->wstr and
   ->wstr_length, whereas PyUnicodeObject keeps those fields inside its
   nested _base members -- confirm which pointer type callers pass. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000203#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000211#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214 structure. state.ascii and state.compact are set, and the data
215 immediately follow the structure. utf8_length and wstr_length can be found
216 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000217typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200218 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200219
220 - compact ascii:
221
222 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100223 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200224 * kind = PyUnicode_1BYTE_KIND
225 * compact = 1
226 * ascii = 1
227 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200228 * (length is the length of the utf8 and wstr strings)
229 * (data starts just after the structure)
       * (since ASCII is a subset of UTF-8, the data doubles as the utf8 string)
Victor Stinner910337b2011-10-03 03:20:16 +0200231
232 - compact:
233
234 * structure = PyCompactUnicodeObject
Victor Stinner80bc72d2011-12-22 03:23:10 +0100235 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237 PyUnicode_4BYTE_KIND
238 * compact = 1
239 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200240 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200242 * utf8_length = 0 if utf8 is NULL
243 * wstr is shared with data and wstr_length=length
244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200246 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200247 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200248
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200249 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200250
251 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100252 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100253 * length = 0 (use wstr_length)
254 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200255 * kind = PyUnicode_WCHAR_KIND
256 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200257 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200258 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100259 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200260 * wstr is not NULL
261 * data.any is NULL
262 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200263 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200266
267 * structure = PyUnicodeObject structure
Victor Stinner7a9105a2011-12-12 00:13:42 +0100268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270 PyUnicode_4BYTE_KIND
271 * compact = 0
272 * ready = 1
273 * data.any is not NULL
       * utf8 is shared with data.any and utf8_length = length if ascii = 1
275 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100276 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
         or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
279 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200280
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200281 Compact strings use only one memory block (structure + characters),
282 whereas legacy strings use one block for the structure and one block
283 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200284
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200285 Legacy strings are created by PyUnicode_FromUnicode() and
286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287 when PyUnicode_READY() is called.
288
289 See also _PyUnicode_CheckConsistency().
290 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000291 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000293 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 struct {
295 /*
296 SSTATE_NOT_INTERNED (0)
297 SSTATE_INTERNED_MORTAL (1)
298 SSTATE_INTERNED_IMMORTAL (2)
299
300 If interned != SSTATE_NOT_INTERNED, the two references from the
301 dictionary to this object are *not* counted in ob_refcnt.
302 */
303 unsigned int interned:2;
304 /* Character size:
305
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200306 - PyUnicode_WCHAR_KIND (0):
307
308 * character type = wchar_t (16 or 32 bits, depending on the
309 platform)
310
311 - PyUnicode_1BYTE_KIND (1):
312
313 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100314 * all characters are in the range U+0000-U+00FF (latin1)
315 * if ascii is set, all characters are in the range U+0000-U+007F
316 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200317 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200318
319 - PyUnicode_2BYTE_KIND (2):
320
321 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100322 * all characters are in the range U+0000-U+FFFF (BMP)
323 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200324
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200325 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200326
327 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100328 * all characters are in the range U+0000-U+10FFFF
329 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200331 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200332 /* Compact is with respect to the allocation scheme. Compact unicode
333 objects only require one memory block while non-compact objects use
334 one block for the PyUnicodeObject struct and another for its data
335 buffer. */
336 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100337 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200340 unsigned int ascii:1;
341 /* The ready flag indicates whether the object layout is initialized
342 completely. This means that this is either a compact object, or
343 the data pointer is filled out. The bit is redundant, and helps
344 to minimize the test in PyUnicode_IS_READY(). */
345 unsigned int ready:1;
346 } state;
347 wchar_t *wstr; /* wchar_t representation (null-terminated) */
348} PyASCIIObject;
349
350/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200351 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200352 immediately follow the structure. */
typedef struct {
    PyASCIIObject _base;        /* Common header: length, hash, state flags
                                 * and the wstr pointer. */
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0; 0 if utf8 is NULL. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of wchar_t code units in wstr; a
                                 * surrogate pair counts as two units.
                                 * 0 if wstr is NULL. */
} PyCompactUnicodeObject;
361
362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
363 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200364 block, and copied into the data block using _PyUnicode_Ready. */
typedef struct {
    PyCompactUnicodeObject _base;   /* Header shared with compact strings
                                     * (adds utf8, utf8_length, wstr_length) */
    /* All members alias the same pointer; pick the view that matches the
       character type of the string's kind. */
    union {
        void *any;                  /* Untyped view; NULL while a legacy
                                     * string is not ready yet */
        Py_UCS1 *latin1;            /* 8-bit characters */
        Py_UCS2 *ucs2;              /* 16-bit characters */
        Py_UCS4 *ucs4;              /* 32-bit characters */
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000374#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000375
Mark Hammond91a681d2002-08-12 07:21:58 +0000376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000378
Thomas Wouters27d517b2007-02-25 20:39:11 +0000379#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000380 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000382
383/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000384#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200385
/* Length of the wstr representation in wchar_t code units.  Compact ASCII
   strings are laid out as a bare PyASCIIObject, which has no wstr_length
   field, so their `length` field is used instead.  `op` is parenthesized
   inside each cast so that expression arguments (pointer arithmetic,
   conditionals, ...) are cast as a whole rather than having the cast bind
   to their first operand only. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
390
391/* Returns the deprecated Py_UNICODE representation's size in code units
392 (this includes surrogate pairs as 2 units).
393 If the Py_UNICODE representation is not available, it will be computed
394 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
395
Victor Stinnerf3ae6202011-11-21 02:24:49 +0100396#define PyUnicode_GET_SIZE(op) \
397 (assert(PyUnicode_Check(op)), \
398 (((PyASCIIObject *)(op))->wstr) ? \
399 PyUnicode_WSTR_LENGTH(op) : \
400 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
401 assert(((PyASCIIObject *)(op))->wstr), \
402 PyUnicode_WSTR_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200403
Guido van Rossumd8225182000-03-10 22:33:05 +0000404#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
406
407/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
408 representation on demand. Using this macro is very inefficient now,
409 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
410 use PyUnicode_WRITE() and PyUnicode_READ(). */
411
Guido van Rossumd8225182000-03-10 22:33:05 +0000412#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200413 (assert(PyUnicode_Check(op)), \
414 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
415 PyUnicode_AsUnicode((PyObject *)(op)))
416
Guido van Rossumd8225182000-03-10 22:33:05 +0000417#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418 ((const char *)(PyUnicode_AS_UNICODE(op)))
419
420
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200422
Victor Stinner6f9568b2011-11-17 00:12:44 +0100423/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200424
425/* Interning state. */
426#define SSTATE_NOT_INTERNED 0
427#define SSTATE_INTERNED_MORTAL 1
428#define SSTATE_INTERNED_IMMORTAL 2
429
Victor Stinnera3b334d2011-10-03 13:53:37 +0200430/* Return true if the string contains only ASCII characters, or 0 if not. The
Victor Stinner24c74be2011-12-12 01:24:20 +0100431 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
432 ready. */
/* Evaluates to the `ascii` state bit of `op`.  In debug builds the two
   asserts verify that `op` is a unicode object and that it is ready.
   `op` is parenthesized inside the cast so that an expression argument is
   cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
Victor Stinnera3b334d2011-10-03 13:53:37 +0200437
438/* Return true if the string is compact or 0 if not.
439 No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is expanded twice (once per state bit) and is parenthesized inside
   the cast so that an expression argument is cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447
/* Character storage kinds, as kept in the `kind` bit-field of
   PyASCIIObject.state.  The non-zero values equal the size in bytes of one
   stored character (Py_UCS1, Py_UCS2, Py_UCS4). */
enum PyUnicode_Kind {
/* The string only has a wchar_t (wstr) representation. This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,   /* Py_UCS1 storage, U+0000-U+00FF */
    PyUnicode_2BYTE_KIND = 2,   /* Py_UCS2 storage, U+0000-U+FFFF */
    PyUnicode_4BYTE_KIND = 4    /* Py_UCS4 storage, U+0000-U+10FFFF */
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200458
Georg Brandl4975a9b2011-10-05 16:12:21 +0200459/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200461 No checks are performed, use PyUnicode_KIND() before to ensure
462 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
465#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
466#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
467
Victor Stinner157f83f2011-09-28 21:41:31 +0200468/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200469#define PyUnicode_KIND(op) \
470 (assert(PyUnicode_Check(op)), \
471 assert(PyUnicode_IS_READY(op)), \
472 ((PyASCIIObject *)(op))->state.kind)
473
Victor Stinner157f83f2011-09-28 21:41:31 +0200474/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200476 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200477 ((void*)((PyASCIIObject*)(op) + 1)) : \
478 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
479
480#define _PyUnicode_NONCOMPACT_DATA(op) \
481 (assert(((PyUnicodeObject*)(op))->data.any), \
482 ((((PyUnicodeObject *)(op))->data.any)))
483
484#define PyUnicode_DATA(op) \
485 (assert(PyUnicode_Check(op)), \
486 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
487 _PyUnicode_NONCOMPACT_DATA(op))
488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200489/* In the access macros below, "kind" may be evaluated more than once.
490 All other macro parameters are evaluated exactly once, so it is safe
491 to put side effects into them (such as increasing the index). */
492
493/* Write into the canonical representation, this macro does not do any sanity
494 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200495 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200496 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200497 code point value which should be written to that location. */
/* Store `value`, narrowed to the character type selected by `kind`, at
   position `index` of the buffer `data`.  Any kind other than 1-byte or
   2-byte is treated as 4-byte (asserted in debug builds).  `data`, `index`
   and `value` are each evaluated once; `kind` may be evaluated twice. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
515
Georg Brandl07de3252011-10-05 16:47:38 +0200516/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200517 or ready calls are performed. */
/* Fetch the character at `index` from the buffer `data`, interpreting the
   buffer according to `kind`, and widen it to Py_UCS4.  Any kind other than
   1-byte or 2-byte is read as 4-byte.  `kind` may be evaluated twice;
   `data` and `index` are evaluated once. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? ((const Py_UCS1 *)(data))[(index)] : \
      (kind) == PyUnicode_2BYTE_KIND ? ((const Py_UCS2 *)(data))[(index)] : \
                                       ((const Py_UCS4 *)(data))[(index)]))
527
528/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
529 calls PyUnicode_KIND() and might call it twice. For single reads, use
530 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
531 cache kind and use PyUnicode_READ instead. */
/* Read the character at `index` of the ready unicode object `unicode`.
   The kind and data pointer are looked up on every call and the actual
   fetch is delegated to PyUnicode_READ().  `unicode` is expanded several
   times, `index` is evaluated once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     PyUnicode_READ(PyUnicode_KIND((unicode)), \
                    PyUnicode_DATA((unicode)), \
                    (index)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543
/* Returns the length of the unicode string in code points. The caller has
   to make sure that the string has its canonical representation set before
   calling this macro. Call PyUnicode_READY() to ensure that. */
547#define PyUnicode_GET_LENGTH(op) \
548 (assert(PyUnicode_Check(op)), \
549 assert(PyUnicode_IS_READY(op)), \
550 ((PyASCIIObject *)(op))->length)
551
552
553/* Fast check to determine whether an object is ready. Equivalent to
554 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */
555
/* Evaluates to the `ready` state bit of `op`; no type check is performed.
   `op` is parenthesized inside the cast so that an expression argument
   (e.g. pointer arithmetic) is cast as a whole instead of having the cast
   bind to its first operand only. */
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
557
Victor Stinnera3b334d2011-10-03 13:53:37 +0200558/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200559 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200560 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 Returns 0 on success and -1 on errors. */
562#define PyUnicode_READY(op) \
563 (assert(PyUnicode_Check(op)), \
564 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200565 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200567/* Return a maximum character value which is suitable for creating another
568 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200569 than iterating over the string. */
/* Upper bound for the code points stored in the ready string `op`, derived
   from its ascii flag and kind only: 0x7f for ASCII strings, otherwise
   0xff, 0xffff or 0x10ffff for 1-, 2- and 4-byte strings.  `op` is
   expanded several times. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_ASCII(op) ? (0x7f) : \
     PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU) : \
     PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU) : \
     (0x10ffffU))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200579
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000580#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000581
582/* --- Constants ---------------------------------------------------------- */
583
584/* This Unicode character will be used as replacement character during
585 decoding if the errors argument is set to "replace". Note: the
586 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
587 Unicode 3.0. */
588
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200589#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000590
591/* === Public API ========================================================= */
592
593/* --- Plain Py_UNICODE --------------------------------------------------- */
594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200595/* With PEP 393, this is the recommended way to allocate a new unicode object.
596 This function will allocate the object and its buffer in a single memory
597 block. Objects created using this function are not resizable. */
598#ifndef Py_LIMITED_API
599PyAPI_FUNC(PyObject*) PyUnicode_New(
600 Py_ssize_t size, /* Number of code points in the new string */
601 Py_UCS4 maxchar /* maximum code point value in the string */
602 );
603#endif
604
/* Initializes the canonical string representation from the deprecated
606 wstr/Py_UNICODE representation. This function is used to convert Unicode
607 objects which were created using the old API to the new flexible format
608 introduced with PEP 393.
609
610 Don't call this function directly, use the public PyUnicode_READY() macro
611 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612#ifndef Py_LIMITED_API
613PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200614 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615 );
616#endif
617
Victor Stinner034f6cf2011-09-30 02:26:44 +0200618/* Get a copy of a Unicode string. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100619#ifndef Py_LIMITED_API
620PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
Victor Stinner034f6cf2011-09-30 02:26:44 +0200621 PyObject *unicode
622 );
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100623#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200624
/* Copy characters from one unicode object into another, this function performs
Victor Stinner3fe55312012-01-04 00:33:50 +0100626 character conversion when necessary and falls back to memcpy() if possible.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200627
Victor Stinner3fe55312012-01-04 00:33:50 +0100628 Fail if to is too small (smaller than *how_many* or smaller than
Victor Stinnera0702ab2011-09-29 14:14:38 +0200629 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
Victor Stinner3fe55312012-01-04 00:33:50 +0100630 kind(to), or if *to* has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200631
   Return the number of written characters, or return -1 and raise an exception
   on error.
634
635 Pseudo-code:
636
637 how_many = min(how_many, len(from) - from_start)
638 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
639 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200640
641 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200642 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 PyObject *to,
646 Py_ssize_t to_start,
647 PyObject *from,
648 Py_ssize_t from_start,
649 Py_ssize_t how_many
650 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200651
652/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
653 may crash if parameters are invalid (e.g. if the output string
654 is too short). */
655PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
656 PyObject *to,
657 Py_ssize_t to_start,
658 PyObject *from,
659 Py_ssize_t from_start,
660 Py_ssize_t how_many
661 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662#endif
663
Victor Stinnerd3f08822012-05-29 12:57:52 +0200664#ifndef Py_LIMITED_API
Victor Stinner3fe55312012-01-04 00:33:50 +0100665/* Fill a string with a character: write fill_char into
666 unicode[start:start+length].
667
668 Fail if fill_char is bigger than the string maximum character, or if the
669 string has more than 1 reference.
670
   Return the number of written characters, or return -1 and raise an exception
   on error. */
Victor Stinner3fe55312012-01-04 00:33:50 +0100673PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
674 PyObject *unicode,
675 Py_ssize_t start,
676 Py_ssize_t length,
677 Py_UCS4 fill_char
678 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200679
680/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
681 if parameters are invalid (e.g. if length is longer than the string). */
682PyAPI_FUNC(void) _PyUnicode_FastFill(
683 PyObject *unicode,
684 Py_ssize_t start,
685 Py_ssize_t length,
686 Py_UCS4 fill_char
687 );
Victor Stinner3fe55312012-01-04 00:33:50 +0100688#endif
689
Guido van Rossumd8225182000-03-10 22:33:05 +0000690/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000692
693 u may be NULL which causes the contents to be undefined. It is the
694 user's responsibility to fill in the needed data afterwards. Note
695 that modifying the Unicode object contents after construction is
696 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000697
698 The buffer is copied into the new object. */
699
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000700#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000701PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000702 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000703 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000704 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000705#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000706
Georg Brandl952867a2010-06-27 10:17:12 +0000707/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000709 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000710 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711 );
712
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000713/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000715PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000716 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000717 );
718
Victor Stinnerd3f08822012-05-29 12:57:52 +0200719#ifndef Py_LIMITED_API
Victor Stinnerb9275c12011-10-05 14:01:42 +0200720/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
721 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
723 int kind,
724 const void *buffer,
725 Py_ssize_t size);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200726
727/* Create a new string from a buffer of ASCII characters.
728 WARNING: Don't check if the string contains any non-ASCII character. */
729PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
730 const char *buffer,
731 Py_ssize_t size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732#endif
733
734PyAPI_FUNC(PyObject*) PyUnicode_Substring(
735 PyObject *str,
736 Py_ssize_t start,
737 Py_ssize_t end);
738
Victor Stinnerece58de2012-04-23 23:36:38 +0200739#ifndef Py_LIMITED_API
740/* Compute the maximum character of the substring unicode[start:end].
741 Return 127 for an empty string. */
742PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
743 PyObject *unicode,
744 Py_ssize_t start,
745 Py_ssize_t end);
746#endif
747
Georg Brandldb6c7f52011-10-07 11:19:11 +0200748/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200749 is set. Return NULL and raise an exception on error. Raise a ValueError if
750 the buffer is smaller than the string. Return buffer on success.
751
752 buflen is the length of the buffer in (Py_UCS4) characters. */
753PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
754 PyObject *unicode,
755 Py_UCS4* buffer,
756 Py_ssize_t buflen,
757 int copy_null);
758
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
   PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
762PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
763
Guido van Rossumd8225182000-03-10 22:33:05 +0000764/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200765 Py_UNICODE buffer.
766 If the wchar_t/Py_UNICODE representation is not yet available, this
767 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000768
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000769#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000770PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000772 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000773#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200775/* Return a read-only pointer to the Unicode object's internal
776 Py_UNICODE buffer and save the length at size.
777 If the wchar_t/Py_UNICODE representation is not yet available, this
778 function will calculate it. */
779
780#ifndef Py_LIMITED_API
781PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
782 PyObject *unicode, /* Unicode object */
783 Py_ssize_t *size /* location where to save the length */
784 );
785#endif
786
Guido van Rossumd8225182000-03-10 22:33:05 +0000787/* Get the length of the Unicode object. */
788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
790 PyObject *unicode
791);
792
Victor Stinner157f83f2011-09-28 21:41:31 +0200793/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200794 string representation. */
795
Martin v. Löwis18e16552006-02-15 17:27:45 +0000796PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000798 );
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800/* Read a character from the string. */
801
802PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
803 PyObject *unicode,
804 Py_ssize_t index
805 );
806
807/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200808 PyUnicode_New, must not be shared, and must not have been hashed yet.
809
810 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200811
812PyAPI_FUNC(int) PyUnicode_WriteChar(
813 PyObject *unicode,
814 Py_ssize_t index,
815 Py_UCS4 character
816 );
817
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000818#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000819/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000820PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000821#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000822
/* Resize a Unicode object. The length is the number of characters, except
824 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
825 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000826
827 *unicode is modified to point to the new (resized) object and 0
828 returned on success.
829
   Try to resize the string in place (which is usually faster than allocating
   a new string and copying characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000832
833 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100834 is returned and *unicode left untouched.
835
836 WARNING: The function doesn't check string content, the result may not be a
837 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000838
Mark Hammond91a681d2002-08-12 07:21:58 +0000839PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000840 PyObject **unicode, /* Pointer to the Unicode object */
841 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000842 );
843
/* Coerce obj to a Unicode object and return a reference with
845 *incremented* refcount.
846
847 Coercion is done in the following way:
848
Georg Brandl952867a2010-06-27 10:17:12 +0000849 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000850 under the assumptions that they contain data using the UTF-8
851 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000852
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000853 2. All other objects (including Unicode objects) raise an
854 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000855
856 The API returns NULL in case of an error. The caller is responsible
857 for decref'ing the returned objects.
858
859*/
860
Mark Hammond91a681d2002-08-12 07:21:58 +0000861PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200862 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000863 const char *encoding, /* encoding */
864 const char *errors /* error handling */
865 );
866
/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000868 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000869
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000870 Unicode objects are passed back as-is (subclasses are converted to
871 true Unicode objects), all other objects are delegated to
872 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000873 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000874
875 The API returns NULL in case of an error. The caller is responsible
876 for decref'ing the returned objects.
877
878*/
879
Mark Hammond91a681d2002-08-12 07:21:58 +0000880PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000882 );
883
Victor Stinner1205f272010-09-11 00:54:47 +0000884PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
885 const char *format, /* ASCII-encoded string */
886 va_list vargs
887 );
888PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
889 const char *format, /* ASCII-encoded string */
890 ...
891 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000893#ifndef Py_LIMITED_API
Victor Stinnerd3f08822012-05-29 12:57:52 +0200894typedef struct {
895 PyObject *buffer;
896 void *data;
897 enum PyUnicode_Kind kind;
898 Py_UCS4 maxchar;
899 Py_ssize_t size;
900 Py_ssize_t pos;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200901
902 /* minimum number of allocated characters (default: 0) */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200903 Py_ssize_t min_length;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200904
905 /* minimum character (default: 127, ASCII) */
906 Py_UCS4 min_char;
907
908 /* If non-zero, overallocate the buffer by 25% (default: 0). */
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200909 unsigned char overallocate;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200910
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200911 /* If readonly is 1, buffer is a shared string (cannot be modified)
912 and size is set to 0. */
913 unsigned char readonly;
Victor Stinnerd3f08822012-05-29 12:57:52 +0200914} _PyUnicodeWriter ;
915
916/* Initialize a Unicode writer.
Victor Stinner8f674cc2013-04-17 23:02:17 +0200917 *
918 * By default, the minimum buffer size is 0 character and overallocation is
919 * disabled. Set min_length, min_char and overallocate attributes to control
920 * the allocation of the buffer. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200921PyAPI_FUNC(void)
Victor Stinner8f674cc2013-04-17 23:02:17 +0200922_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200923
924/* Prepare the buffer to write 'length' characters
925 with the specified maximum character.
926
927 Return 0 on success, raise an exception and return -1 on error. */
/* Fast path: evaluates to 0 without a function call when the writer already
   accepts MAXCHAR and has at least LENGTH free slots (size - pos), or when
   LENGTH is 0.  Otherwise defers to _PyUnicodeWriter_PrepareInternal().
   Note that LENGTH and MAXCHAR may be evaluated more than once. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
    ((((MAXCHAR) <= (WRITER)->maxchar \
       && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
      || (LENGTH) == 0) \
     ? 0 \
     : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))
935
936/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
937 instead. */
938PyAPI_FUNC(int)
939_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
940 Py_ssize_t length, Py_UCS4 maxchar);
941
Victor Stinnera0dd0212013-04-11 22:09:04 +0200942/* Append a Unicode character.
943 Return 0 on success, raise an exception and return -1 on error. */
944PyAPI_FUNC(int)
945_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
946 Py_UCS4 ch
947 );
948
Victor Stinnere215d962012-10-06 23:03:36 +0200949/* Append a Unicode string.
950 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200951PyAPI_FUNC(int)
Victor Stinnere215d962012-10-06 23:03:36 +0200952_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
953 PyObject *str /* Unicode string */
954 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200955
Victor Stinnercfc4c132013-04-03 01:48:39 +0200956/* Append a substring of a Unicode string.
957 Return 0 on success, raise an exception and return -1 on error. */
958PyAPI_FUNC(int)
959_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
960 PyObject *str, /* Unicode string */
961 Py_ssize_t start,
962 Py_ssize_t end
963 );
964
/* Append an ASCII-encoded byte string.
966 Return 0 on success, raise an exception and return -1 on error. */
967PyAPI_FUNC(int)
968_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
969 const char *str, /* ASCII-encoded byte string */
970 Py_ssize_t len /* number of bytes, or -1 if unknown */
971 );
972
Victor Stinnere215d962012-10-06 23:03:36 +0200973/* Append a latin1-encoded byte string.
974 Return 0 on success, raise an exception and return -1 on error. */
975PyAPI_FUNC(int)
Victor Stinner4a587072013-11-19 12:54:53 +0100976_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
977 const char *str, /* latin1-encoded byte string */
978 Py_ssize_t len /* length in bytes */
Victor Stinnere215d962012-10-06 23:03:36 +0200979 );
980
/* Get the value of the writer as a Unicode string. Clear the
Victor Stinnere215d962012-10-06 23:03:36 +0200982 buffer of the writer. Raise an exception and return NULL
983 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200984PyAPI_FUNC(PyObject *)
985_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
986
Victor Stinnere215d962012-10-06 23:03:36 +0200987/* Deallocate memory of a writer (clear its internal buffer). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200988PyAPI_FUNC(void)
989_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
990#endif
991
992#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000993/* Format the object based on the format_spec, as defined in PEP 3101
994 (Advanced String Formatting). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200995PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
996 _PyUnicodeWriter *writer,
997 PyObject *obj,
998 PyObject *format_spec,
999 Py_ssize_t start,
1000 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001001#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +00001002
Walter Dörwald16807132007-05-25 13:52:07 +00001003PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
1004PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001005PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1006 const char *u /* UTF-8 encoded string */
1007 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001008#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +00001009PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001010#endif
Walter Dörwald16807132007-05-25 13:52:07 +00001011
1012/* Use only if you know it's a string */
/* Reads the state.interned field of op through a PyASCIIObject cast;
   no type check is performed. */
#define PyUnicode_CHECK_INTERNED(op) (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +00001015
Guido van Rossumd8225182000-03-10 22:33:05 +00001016/* --- wchar_t support for platforms which support it --------------------- */
1017
1018#ifdef HAVE_WCHAR_H
1019
Georg Brandl952867a2010-06-27 10:17:12 +00001020/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +00001021 size.
1022
1023 The buffer is copied into the new object. */
1024
Mark Hammond91a681d2002-08-12 07:21:58 +00001025PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001026 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001028 );
1029
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001030/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +00001031 most size wchar_t characters are copied.
1032
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001033 Note that the resulting wchar_t string may or may not be
1034 0-terminated. It is the responsibility of the caller to make sure
1035 that the wchar_t string is 0-terminated in case this is required by
1036 the application.
1037
1038 Returns the number of wchar_t characters copied (excluding a
1039 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +00001040 error. */
1041
Martin v. Löwis18e16552006-02-15 17:27:45 +00001042PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001043 PyObject *unicode, /* Unicode object */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001044 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001045 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001046 );
1047
Victor Stinner137c34c2010-09-29 10:25:54 +00001048/* Convert the Unicode object to a wide character string. The output string
1049 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +02001050 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +00001051
   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
1053 on success. On error, returns NULL, *size is undefined and raises a
1054 MemoryError. */
1055
1056PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001057 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +00001058 Py_ssize_t *size /* number of characters of the result */
1059 );
1060
Victor Stinner9f789e72011-10-01 03:57:28 +02001061#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +02001063#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064
Guido van Rossumd8225182000-03-10 22:33:05 +00001065#endif
1066
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067/* --- Unicode ordinals --------------------------------------------------- */
1068
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069/* Create a Unicode Object from the given Unicode code point ordinal.
1070
Ezio Melottie7f90372012-10-05 03:33:31 +03001071 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072 raised in case it is not.
1073
1074*/
1075
Marc-André Lemburg9c329de2002-08-12 08:19:10 +00001076PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001078/* --- Free-list management ----------------------------------------------- */
1079
1080/* Clear the free list used by the Unicode implementation.
1081
1082 This can be used to release memory used for objects on the free
1083 list back to the Python memory allocator.
1084
1085*/
1086
1087PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1088
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +00001090
1091 Many of these APIs take two arguments encoding and errors. These
1092 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001093 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +00001094
Georg Brandl952867a2010-06-27 10:17:12 +00001095 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +00001096
1097 Error handling is set by errors which may also be set to NULL
1098 meaning to use the default handling defined for the codec. Default
1099 error handling for all builtin codecs is "strict" (ValueErrors are
1100 raised).
1101
   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.
1104
1105*/
1106
Fred Drakecb093fe2000-05-09 19:51:53 +00001107/* --- Manage the default encoding ---------------------------------------- */
1108
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001109/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001110 Unicode object unicode and the size of the encoded representation
1111 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +00001112
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001113 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001114
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001115 This function caches the UTF-8 encoded string in the unicodeobject
1116 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 when the unicodeobject is deallocated.
1118
1119 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1120 support the previous internal function with the same behaviour.
1121
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001122 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001123 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001124
1125 *** If you need to access the Unicode object as UTF-8 bytes string,
1126 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +00001127*/
1128
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001129#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001132 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001134#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001135
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001136/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001137 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1140 in the unicodeobject.
1141
1142 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1143 support the previous internal function with the same behaviour.
1144
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001145 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001146 extracted from the returned data.
1147
   *** This API is for interpreter INTERNAL USE ONLY and will likely
   *** be removed or changed in the future.
1150
1151 *** If you need to access the Unicode object as UTF-8 bytes string,
1152 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001153
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001154*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001155
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001156#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1158#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001159#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001160
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001161/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001162
Mark Hammond91a681d2002-08-12 07:21:58 +00001163PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001164
Guido van Rossumd8225182000-03-10 22:33:05 +00001165/* --- Generic Codecs ----------------------------------------------------- */
1166
1167/* Create a Unicode object by decoding the encoded string s of the
1168 given size. */
1169
Mark Hammond91a681d2002-08-12 07:21:58 +00001170PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001171 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001172 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001173 const char *encoding, /* encoding */
1174 const char *errors /* error handling */
1175 );
1176
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001177/* Decode a Unicode object unicode and return the result as Python
1178 object. */
1179
1180PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 PyObject *unicode, /* Unicode object */
1182 const char *encoding, /* encoding */
1183 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001184 );
1185
1186/* Decode a Unicode object unicode and return the result as Unicode
1187 object. */
1188
1189PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190 PyObject *unicode, /* Unicode object */
1191 const char *encoding, /* encoding */
1192 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001193 );
1194
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001196 Python string object. */
1197
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001198#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001199PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001200 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001201 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001202 const char *encoding, /* encoding */
1203 const char *errors /* error handling */
1204 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001205#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001206
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001207/* Encodes a Unicode object and returns the result as Python
1208 object. */
1209
1210PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001211 PyObject *unicode, /* Unicode object */
1212 const char *encoding, /* encoding */
1213 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001214 );
1215
Guido van Rossumd8225182000-03-10 22:33:05 +00001216/* Encodes a Unicode object and returns the result as Python string
1217 object. */
1218
Mark Hammond91a681d2002-08-12 07:21:58 +00001219PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 PyObject *unicode, /* Unicode object */
1221 const char *encoding, /* encoding */
1222 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001223 );
1224
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001225/* Encodes a Unicode object and returns the result as Unicode
1226 object. */
1227
1228PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 PyObject *unicode, /* Unicode object */
1230 const char *encoding, /* encoding */
1231 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001232 );
1233
1234/* Build an encoding map. */
1235
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001236PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1237 PyObject* string /* 256 character map */
1238 );
1239
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001240/* --- UTF-7 Codecs ------------------------------------------------------- */
1241
/* Decodes length bytes from the UTF-7 encoded buffer string and returns
   the result as a PyObject* (presumably a Unicode object, as documented
   for the UTF-16/UTF-32 decoders -- confirm). errors selects the error
   handling. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001242PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 const char *string, /* UTF-7 encoded string */
1244 Py_ssize_t length, /* size of string */
1245 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001246 );
1247
/* Same inputs as PyUnicode_DecodeUTF7(), plus consumed, which is
   documented as receiving the number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed allows an incomplete
   trailing sequence to be left undecoded instead of raising an error --
   confirm against the implementation. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001248PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 const char *string, /* UTF-7 encoded string */
1250 Py_ssize_t length, /* size of string */
1251 const char *errors, /* error handling */
1252 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001253 );
1254
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001255#ifndef Py_LIMITED_API
/* Encodes length Py_UNICODE characters from data as UTF-7. The two
   base64* arguments choose whether RFC2152 "Set O" characters and
   whitespace are written in base64 (presumably non-zero enables this --
   confirm). Compiled out under Py_LIMITED_API. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001256PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001257 const Py_UNICODE *data, /* Unicode char buffer */
1258 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1259 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1260 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1261 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001262 );
/* Same options as PyUnicode_EncodeUTF7(), but takes a Unicode object
   instead of a raw Py_UNICODE buffer and length. */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001263PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1264 PyObject *unicode, /* Unicode object */
1265 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1266 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1267 const char *errors /* error handling */
1268 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001269#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001270
Guido van Rossumd8225182000-03-10 22:33:05 +00001271/* --- UTF-8 Codecs ------------------------------------------------------- */
1272
/* Decodes length bytes from the UTF-8 encoded buffer string and returns
   the result as a PyObject* (presumably a Unicode object, as documented
   for the UTF-16/UTF-32 decoders -- confirm). errors selects the error
   handling. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001273PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 const char *string, /* UTF-8 encoded string */
1275 Py_ssize_t length, /* size of string */
1276 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001277 );
1278
/* Same inputs as PyUnicode_DecodeUTF8(), plus consumed, which is
   documented as receiving the number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed allows an incomplete
   trailing sequence to be left undecoded instead of raising an error --
   confirm against the implementation. */
Walter Dörwald69652032004-09-07 20:24:22 +00001279PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 const char *string, /* UTF-8 encoded string */
1281 Py_ssize_t length, /* size of string */
1282 const char *errors, /* error handling */
1283 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001284 );
1285
/* Takes a Unicode object and returns its UTF-8 encoded form as a
   PyObject* (presumably a bytes object -- confirm). No errors argument
   is accepted. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001286PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001288 );
1289
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001290#ifndef Py_LIMITED_API
/* Variant of PyUnicode_AsUTF8String() that additionally takes an errors
   argument. Compiled out under Py_LIMITED_API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1292 PyObject *unicode, /* Unicode object */
1293 const char *errors);
1294
/* Encodes length Py_UNICODE characters from the raw buffer data as
   UTF-8. Compiled out under Py_LIMITED_API. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001295PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 const Py_UNICODE *data, /* Unicode char buffer */
1297 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1298 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001299 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001300#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001301
Walter Dörwald41980ca2007-08-16 21:55:45 +00001302/* --- UTF-32 Codecs ------------------------------------------------------ */
1303
1304/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1305 the corresponding Unicode object.
1306
1307 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001309
1310 If byteorder is non-NULL, the decoder starts decoding using the
1311 given byte order:
1312
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 *byteorder == -1: little endian
1314 *byteorder == 0: native order
1315 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001316
1317 In native mode, the first four bytes of the stream are checked for a
1318 BOM mark. If found, the BOM mark is analysed, the byte order
1319 adjusted and the BOM skipped. In the other modes, no BOM mark
1320 interpretation is done. After completion, *byteorder is set to the
1321 current byte order at the end of input data.
1322
1323 If byteorder is NULL, the codec starts in native order mode.
1324
1325*/
1326
1327PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 const char *string, /* UTF-32 encoded string */
1329 Py_ssize_t length, /* size of string */
1330 const char *errors, /* error handling */
1331 int *byteorder /* pointer to byteorder to use
1332 0=native;-1=LE,1=BE; updated on
1333 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001334 );
1335
1336PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 const char *string, /* UTF-32 encoded string */
1338 Py_ssize_t length, /* size of string */
1339 const char *errors, /* error handling */
1340 int *byteorder, /* pointer to byteorder to use
1341 0=native;-1=LE,1=BE; updated on
1342 exit */
1343 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001344 );
1345
1346/* Returns a Python string using the UTF-32 encoding in native byte
1347 order. The string always starts with a BOM mark. */
1348
1349PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001351 );
1352
1353/* Returns a Python string object holding the UTF-32 encoded value of
1354 the Unicode data.
1355
1356 If byteorder is not 0, output is written according to the following
1357 byte order:
1358
1359 byteorder == -1: little endian
1360 byteorder == 0: native byte order (writes a BOM mark)
1361 byteorder == 1: big endian
1362
1363 If byteorder is 0, the output string will always start with the
1364 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1365 prepended.
1366
1367*/
1368
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001369#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001370PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 const Py_UNICODE *data, /* Unicode char buffer */
1372 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1373 const char *errors, /* error handling */
1374 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001375 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001376PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1377 PyObject *object, /* Unicode object */
1378 const char *errors, /* error handling */
1379 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1380 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001381#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001382
Guido van Rossumd8225182000-03-10 22:33:05 +00001383/* --- UTF-16 Codecs ------------------------------------------------------ */
1384
Guido van Rossum9e896b32000-04-05 20:11:21 +00001385/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001386 the corresponding Unicode object.
1387
1388 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001390
1391 If byteorder is non-NULL, the decoder starts decoding using the
1392 given byte order:
1393
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 *byteorder == -1: little endian
1395 *byteorder == 0: native order
1396 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001397
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001398 In native mode, the first two bytes of the stream are checked for a
1399 BOM mark. If found, the BOM mark is analysed, the byte order
1400 adjusted and the BOM skipped. In the other modes, no BOM mark
1401 interpretation is done. After completion, *byteorder is set to the
1402 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001403
1404 If byteorder is NULL, the codec starts in native order mode.
1405
1406*/
1407
Mark Hammond91a681d2002-08-12 07:21:58 +00001408PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 const char *string, /* UTF-16 encoded string */
1410 Py_ssize_t length, /* size of string */
1411 const char *errors, /* error handling */
1412 int *byteorder /* pointer to byteorder to use
1413 0=native;-1=LE,1=BE; updated on
1414 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001415 );
1416
Walter Dörwald69652032004-09-07 20:24:22 +00001417PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 const char *string, /* UTF-16 encoded string */
1419 Py_ssize_t length, /* size of string */
1420 const char *errors, /* error handling */
1421 int *byteorder, /* pointer to byteorder to use
1422 0=native;-1=LE,1=BE; updated on
1423 exit */
1424 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001425 );
1426
Guido van Rossumd8225182000-03-10 22:33:05 +00001427/* Returns a Python string using the UTF-16 encoding in native byte
1428 order. The string always starts with a BOM mark. */
1429
Mark Hammond91a681d2002-08-12 07:21:58 +00001430PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001432 );
1433
1434/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001435 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001436
1437 If byteorder is not 0, output is written according to the following
1438 byte order:
1439
1440 byteorder == -1: little endian
1441 byteorder == 0: native byte order (writes a BOM mark)
1442 byteorder == 1: big endian
1443
1444 If byteorder is 0, the output string will always start with the
1445 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1446 prepended.
1447
1448 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1449 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001450 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001451
1452*/
1453
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001454#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001455PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001456 const Py_UNICODE *data, /* Unicode char buffer */
1457 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1458 const char *errors, /* error handling */
1459 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001460 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001461PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1462 PyObject* unicode, /* Unicode object */
1463 const char *errors, /* error handling */
1464 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1465 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001466#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001467
1468/* --- Unicode-Escape Codecs ---------------------------------------------- */
1469
Mark Hammond91a681d2002-08-12 07:21:58 +00001470PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 const char *string, /* Unicode-Escape encoded string */
1472 Py_ssize_t length, /* size of string */
1473 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001474 );
1475
Mark Hammond91a681d2002-08-12 07:21:58 +00001476PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001478 );
1479
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001480#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001481PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 const Py_UNICODE *data, /* Unicode char buffer */
1483 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001484 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001485#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001486
1487/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1488
Mark Hammond91a681d2002-08-12 07:21:58 +00001489PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 const char *string, /* Raw-Unicode-Escape encoded string */
1491 Py_ssize_t length, /* size of string */
1492 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001493 );
1494
Mark Hammond91a681d2002-08-12 07:21:58 +00001495PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001497 );
1498
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001499#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001500PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001501 const Py_UNICODE *data, /* Unicode char buffer */
1502 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001503 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001504#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001505
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001506/* --- Unicode Internal Codec ---------------------------------------------
1507
1508 Only for internal use in _codecsmodule.c */
1509
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001510#ifndef Py_LIMITED_API
/* Decoder for the "unicode internal" codec; compiled out under
   Py_LIMITED_API.
   NOTE(review): unlike the neighbouring declarations, this prototype is
   not wrapped in PyAPI_FUNC(); presumably intentional because it is only
   meant for _codecsmodule.c -- confirm. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001511PyObject *_PyUnicode_DecodeUnicodeInternal(
1512 const char *string, /* input buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001513 Py_ssize_t length, /* size of string (presumably in bytes; confirm) */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001514 const char *errors /* error handling */
1515 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001516#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001517
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001518/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001519
1520 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1521
1522*/
1523
Mark Hammond91a681d2002-08-12 07:21:58 +00001524PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001525 const char *string, /* Latin-1 encoded string */
1526 Py_ssize_t length, /* size of string */
1527 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001528 );
1529
Mark Hammond91a681d2002-08-12 07:21:58 +00001530PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001532 );
1533
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001534#ifndef Py_LIMITED_API
/* Variant of PyUnicode_AsLatin1String() that additionally takes an
   errors argument. Compiled out under Py_LIMITED_API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1536 PyObject* unicode, /* Unicode object */
1537 const char* errors);
1538
/* Encodes length Py_UNICODE characters from the raw buffer data as
   Latin-1. Compiled out under Py_LIMITED_API. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001539PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001540 const Py_UNICODE *data, /* Unicode char buffer */
1541 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1542 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001543 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001544#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001545
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001546/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001547
1548 Only 7-bit ASCII data is accepted. All other codes generate errors.
1549
1550*/
1551
Mark Hammond91a681d2002-08-12 07:21:58 +00001552PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001553 const char *string, /* ASCII encoded string */
1554 Py_ssize_t length, /* size of string */
1555 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001556 );
1557
Mark Hammond91a681d2002-08-12 07:21:58 +00001558PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001560 );
1561
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001562#ifndef Py_LIMITED_API
/* Variant of PyUnicode_AsASCIIString() that additionally takes an
   errors argument. Compiled out under Py_LIMITED_API. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1564 PyObject* unicode, /* Unicode object */
1565 const char* errors);
1566
/* Encodes length Py_UNICODE characters from the raw buffer data as
   ASCII. Compiled out under Py_LIMITED_API. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001567PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001568 const Py_UNICODE *data, /* Unicode char buffer */
1569 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1570 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001571 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001572#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001573
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001574/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001575
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001576 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001577
1578 Decoding mappings must map single string characters to single
1579 Unicode characters, integers (which are then interpreted as Unicode
1580 ordinals) or None (meaning "undefined mapping" and causing an
1581 error).
1582
1583 Encoding mappings must map single Unicode characters to single
1584 string characters, integers (which are then interpreted as Latin-1
1585 ordinals) or None (meaning "undefined mapping" and causing an
1586 error).
1587
1588 If a character lookup fails with a LookupError, the character is
1589 copied as-is meaning that its ordinal value will be interpreted as
1590 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1591 to contain those mappings which map characters to different code
1592 points.
1593
1594*/
1595
Mark Hammond91a681d2002-08-12 07:21:58 +00001596PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001597 const char *string, /* Encoded string */
1598 Py_ssize_t length, /* size of string */
1599 PyObject *mapping, /* character mapping
1600 (char ordinal -> unicode ordinal) */
1601 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001602 );
1603
Mark Hammond91a681d2002-08-12 07:21:58 +00001604PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001605 PyObject *unicode, /* Unicode object */
1606 PyObject *mapping /* character mapping
1607 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001608 );
1609
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001610#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001611PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001612 const Py_UNICODE *data, /* Unicode char buffer */
1613 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1614 PyObject *mapping, /* character mapping
1615 (unicode ordinal -> char ordinal) */
1616 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001617 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001618PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1619 PyObject *unicode, /* Unicode object */
1620 PyObject *mapping, /* character mapping
1621 (unicode ordinal -> char ordinal) */
1622 const char *errors /* error handling */
1623 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001624#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001625
1626/* Translate a Py_UNICODE buffer of the given length by applying a
1627 character mapping table to it and return the resulting Unicode
1628 object.
1629
1630 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001631 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001632
1633 Mapping tables may be dictionaries or sequences. Unmapped character
1634 ordinals (ones which cause a LookupError) are left untouched and
1635 are copied as-is.
1636
1637*/
1638
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001639#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001640PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001641 const Py_UNICODE *data, /* Unicode char buffer */
1642 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1643 PyObject *table, /* Translation table (dictionary or sequence) */
1644 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001645 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001646#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001647
Victor Stinner99b95382011-07-04 14:23:54 +02001648#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001649
Guido van Rossumefec1152000-03-28 02:01:15 +00001650/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001651
Mark Hammond91a681d2002-08-12 07:21:58 +00001652PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001653 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001654 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001655 const char *errors /* error handling */
1656 );
1657
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001658PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1659 const char *string, /* MBCS encoded string */
1660 Py_ssize_t length, /* size of string */
1661 const char *errors, /* error handling */
1662 Py_ssize_t *consumed /* bytes consumed */
1663 );
1664
Victor Stinner3a50e702011-10-18 21:21:00 +02001665PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1666 int code_page, /* code page number */
1667 const char *string, /* encoded string */
1668 Py_ssize_t length, /* size of string */
1669 const char *errors, /* error handling */
1670 Py_ssize_t *consumed /* bytes consumed */
1671 );
1672
Mark Hammond91a681d2002-08-12 07:21:58 +00001673PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001674 PyObject *unicode /* Unicode object */
1675 );
1676
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001677#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001678PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001679 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001680 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001681 const char *errors /* error handling */
1682 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001683#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001684
Victor Stinner3a50e702011-10-18 21:21:00 +02001685PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1686 int code_page, /* code page number */
1687 PyObject *unicode, /* Unicode object */
1688 const char *errors /* error handling */
1689 );
1690
Victor Stinner99b95382011-07-04 14:23:54 +02001691#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001692
Guido van Rossum9e896b32000-04-05 20:11:21 +00001693/* --- Decimal Encoder ---------------------------------------------------- */
1694
1695/* Takes a Unicode string holding a decimal value and writes it into
1696 an output buffer using standard ASCII digit codes.
1697
1698 The output buffer has to provide at least length+1 bytes of storage
1699 area. The output string is 0-terminated.
1700
1701 The encoder converts whitespace to ' ', decimal characters to their
1702 corresponding ASCII digit and all other Latin-1 characters except
1703 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1704 are treated as errors. This includes embedded NUL characters.
1705
1706 Error handling is defined by the errors argument:
1707
1708 NULL or "strict": raise a ValueError
1709 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001711 "replace": replaces illegal characters with '?'
1712
1713 Returns 0 on success, -1 on failure.
1714
1715*/
1716
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001717#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001718PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 Py_UNICODE *s, /* Unicode buffer */
1720 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1721 char *output, /* Output buffer; must have size >= length+1
                      (room for the terminating 0) */
1722 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001723 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001724#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001725
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001726/* Transforms code points that have decimal digit property to the
1727 corresponding ASCII digit code points.
1728
1729 Returns a new Unicode string on success, NULL on failure.
1730*/
1731
Georg Brandlb5503082010-12-05 11:40:48 +00001732#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001733PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1734 Py_UNICODE *s, /* Unicode buffer */
1735 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1736 );
Georg Brandlb5503082010-12-05 11:40:48 +00001737#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001738
Victor Stinner6f9568b2011-11-17 00:12:44 +01001739/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 as argument instead of a raw buffer and length. This function additionally
1741 transforms spaces to ASCII because this is what the callers in longobject,
1742 floatobject, and complexobject did anyways. */
1743
1744#ifndef Py_LIMITED_API
1745PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1746 PyObject *unicode /* Unicode object */
1747 );
1748#endif
1749
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001750/* --- Locale encoding --------------------------------------------------- */
1751
1752/* Decode a string from the current locale encoding. The error handler is
1753 selected by *errors* (this comment used to describe an int *surrogateescape*
1754 flag; presumably "strict" and "surrogateescape" (PEP 383) are the accepted
1755 values and NULL means "strict" -- TODO confirm). With 'surrogateescape',
1756 undecodable bytes are escaped, and so is a byte sequence that would decode
1757 to a surrogate character. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001758 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001759
1760PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1761 const char *str, /* null-terminated encoded string */
1762 Py_ssize_t len, /* length of str (what strlen() would report) */
Victor Stinner1b579672011-12-17 05:47:23 +01001763 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001764
1765/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1766 length using strlen(). */
1767
1768PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1769 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001770 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001771
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001772/* Encode a Unicode object to the current locale encoding. The error handler
1773 is selected by *errors* (presumably "strict" or "surrogateescape", replacing
1774 the former *surrogateescape* flag -- TODO confirm). Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +01001775 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001776
1777PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1778 PyObject *unicode, /* Unicode object */
Victor Stinner1b579672011-12-17 05:47:23 +01001779 const char *errors /* error handling */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001780 );
1781
Martin v. Löwis011e8422009-05-05 04:43:17 +00001782/* --- File system encoding ---------------------------------------------- */
1783
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001784/* ParseTuple converter: encode str objects to bytes using
1785 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001786
1787PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1788
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001789/* ParseTuple converter: decode bytes objects to unicode using
1790 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1791
1792PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1793
Victor Stinner77c38622010-05-14 15:58:55 +00001794/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1795 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001796
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001797 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1798 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001799
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001800 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001801*/
1802
1803PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1804 const char *s /* encoded string */
1805 );
1806
Victor Stinner77c38622010-05-14 15:58:55 +00001807/* Decode a string using Py_FileSystemDefaultEncoding
1808 and the "surrogateescape" error handler.
1809
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001810 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1811 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001812*/
1813
Martin v. Löwis011e8422009-05-05 04:43:17 +00001814PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1815 const char *s, /* encoded string */
1816 Py_ssize_t size /* size */
1817 );
1818
Victor Stinnerae6265f2010-05-15 16:27:27 +00001819/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001820 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001821
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001822 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1823 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001824*/
1825
1826PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1827 PyObject *unicode
1828 );
1829
Guido van Rossumd8225182000-03-10 22:33:05 +00001830/* --- Methods & Slots ----------------------------------------------------
1831
1832 These are capable of handling Unicode objects and strings on input
1833 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001834 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001835
1836/* Concat two strings giving a new Unicode string. */
1837
Mark Hammond91a681d2002-08-12 07:21:58 +00001838PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001839 PyObject *left, /* Left string */
1840 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001841 );
1842
Walter Dörwald1ab83302007-05-18 17:15:44 +00001843/* Concat two strings and put the result in *pleft
1844 (sets *pleft to NULL on error) */
1845
1846PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001847 PyObject **pleft, /* Pointer to left string */
1848 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001849 );
1850
1851/* Concat two strings, put the result in *pleft and drop the right object
1852 (sets *pleft to NULL on error) */
1853
1854PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001855 PyObject **pleft, /* Pointer to left string */
1856 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001857 );
1858
Guido van Rossumd8225182000-03-10 22:33:05 +00001859/* Split a string giving a list of Unicode strings.
1860
1861 If sep is NULL, splitting will be done at all whitespace
1862 substrings. Otherwise, splits occur at the given separator.
1863
1864 At most maxsplit splits will be done. If negative, no limit is set.
1865
1866 Separators are not included in the resulting list.
1867
1868*/
1869
Mark Hammond91a681d2002-08-12 07:21:58 +00001870PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001871 PyObject *s, /* String to split */
1872 PyObject *sep, /* String separator */
1873 Py_ssize_t maxsplit /* Maxsplit count */
1874 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001875
1876/* Ditto, but split at line breaks.
1877
1878 CRLF is considered to be one line break. Line breaks are not
1879 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001880
Mark Hammond91a681d2002-08-12 07:21:58 +00001881PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001882 PyObject *s, /* String to split */
1883 int keepends /* If true, line end markers are included */
1884 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001885
Thomas Wouters477c8d52006-05-27 19:21:47 +00001886/* Partition a string using a given separator. */
1887
1888PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001889 PyObject *s, /* String to partition */
1890 PyObject *sep /* String separator */
1891 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001892
1893/* Partition a string using a given separator, searching from the end of the
1894 string. */
1895
1896PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001897 PyObject *s, /* String to partition */
1898 PyObject *sep /* String separator */
1899 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001900
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001901/* Split a string giving a list of Unicode strings.
1902
1903 If sep is NULL, splitting will be done at all whitespace
1904 substrings. Otherwise, splits occur at the given separator.
1905
1906 At most maxsplit splits will be done; if maxsplit is negative, no
1907 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1908 the end of the string.
1909
1910 Separators are not included in the resulting list.
1911
1912*/
1913
1914PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001915 PyObject *s, /* String to split */
1916 PyObject *sep, /* String separator */
1917 Py_ssize_t maxsplit /* Maxsplit count */
1918 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001919
Guido van Rossumd8225182000-03-10 22:33:05 +00001920/* Translate a string by applying a character mapping table to it and
1921 return the resulting Unicode object.
1922
1923 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001924 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001925
1926 Mapping tables may be dictionaries or sequences. Unmapped character
1927 ordinals (ones which cause a LookupError) are left untouched and
1928 are copied as-is.
1929
1930*/
1931
Mark Hammond91a681d2002-08-12 07:21:58 +00001932PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001933 PyObject *str, /* String */
1934 PyObject *table, /* Translate table */
1935 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
1938/* Join a sequence of strings using the given separator and return
1939 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001940
Mark Hammond91a681d2002-08-12 07:21:58 +00001941PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001942 PyObject *separator, /* Separator string */
1943 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001944 );
1945
1946/* Return 1 if substr matches str[start:end] at the given tail end, 0
1947 otherwise. Return -1 if an error occurred. */
1948
Martin v. Löwis18e16552006-02-15 17:27:45 +00001949PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001950 PyObject *str, /* String */
1951 PyObject *substr, /* Prefix or Suffix string */
1952 Py_ssize_t start, /* Start index */
1953 Py_ssize_t end, /* Stop index */
1954 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001955 );
1956
1957/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001958 given search direction or -1 if not found. -2 is returned in case
1959 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001960
Martin v. Löwis18e16552006-02-15 17:27:45 +00001961PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001962 PyObject *str, /* String */
1963 PyObject *substr, /* Substring to find */
1964 Py_ssize_t start, /* Start index */
1965 Py_ssize_t end, /* Stop index */
1966 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001967 );
1968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969/* Like PyUnicode_Find, but search for single character only. */
1970PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1971 PyObject *str, /* String */
1972 Py_UCS4 ch, /* Character to find */
1973 Py_ssize_t start, /* Start index */
1974 Py_ssize_t end, /* Stop index */
1975 int direction /* Find direction: +1 forward, -1 backward */
1976 );
1977
Barry Warsaw51ac5802000-03-20 16:36:48 +00001978/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001979
Martin v. Löwis18e16552006-02-15 17:27:45 +00001980PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001981 PyObject *str, /* String */
1982 PyObject *substr, /* Substring to count */
1983 Py_ssize_t start, /* Start index */
1984 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001985 );
1986
Barry Warsaw51ac5802000-03-20 16:36:48 +00001987/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001988 and return the resulting Unicode object. */
1989
Mark Hammond91a681d2002-08-12 07:21:58 +00001990PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001991 PyObject *str, /* String */
1992 PyObject *substr, /* Substring to find */
1993 PyObject *replstr, /* Substring to replace */
1994 Py_ssize_t maxcount /* Max. number of replacements to apply;
1995 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001996 );
1997
1998/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +02001999 greater than, respectively.
2000 Raise an exception and return -1 on error (-1 is also a valid result, so check PyErr_Occurred() to tell them apart). */
Guido van Rossumd8225182000-03-10 22:33:05 +00002001
Mark Hammond91a681d2002-08-12 07:21:58 +00002002PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002003 PyObject *left, /* Left string */
2004 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00002005 );
2006
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002007PyAPI_FUNC(int) _PyUnicode_CompareWithId(
2008 PyObject *left, /* Left string */
2009 _Py_Identifier *right /* Right identifier */
2010 );
2011
Martin v. Löwis5b222132007-06-10 09:51:05 +00002012PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2013 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00002014 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00002015 );
2016
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002017/* Rich compare two strings and return one of the following:
2018
2019 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02002020 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002021 - Py_NotImplemented in case the type combination is unknown
2022
2023 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
2024 case the conversion of the arguments to Unicode fails with a
2025 UnicodeDecodeError.
2026
2027 Possible values for op:
2028
2029 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2030
2031*/
2032
2033PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002034 PyObject *left, /* Left string */
2035 PyObject *right, /* Right string */
2036 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002037 );
2038
Thomas Wouters7e474022000-07-16 12:04:32 +00002039/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00002040 the resulting Unicode string. */
2041
Mark Hammond91a681d2002-08-12 07:21:58 +00002042PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002043 PyObject *format, /* Format string */
2044 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00002045 );
2046
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002047/* Checks whether element is contained in container and return 1/0
2048 accordingly.
2049
2050 element has to coerce to a one-element Unicode string. -1 is
2051 returned in case of an error. */
2052
Mark Hammond91a681d2002-08-12 07:21:58 +00002053PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002054 PyObject *container, /* Container string */
2055 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002056 );
2057
Antoine Pitrou13348842012-01-29 18:36:34 +01002058/* Checks whether the string contains any NUL characters. */
2059
2060#ifndef Py_LIMITED_API
2061PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *);
2062#endif
2063
Martin v. Löwis47383402007-08-15 07:32:56 +00002064/* Checks whether argument is a valid identifier. */
2065
2066PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2067
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002068#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002069/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00002070PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002071 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002072 int striptype,
2073 PyObject *sepobj
2074 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002075#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002076
Eric Smitha3b1ac82009-04-03 14:45:06 +00002077/* Using explicit passed-in values, insert the thousands grouping
2078 into the string pointed to by buffer. For the argument descriptions,
2079 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002080#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02002082 PyObject *unicode,
Victor Stinner41a863c2012-02-24 00:37:51 +01002083 Py_ssize_t index,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 Py_ssize_t n_buffer,
2085 void *digits,
2086 Py_ssize_t n_digits,
2087 Py_ssize_t min_width,
2088 const char *grouping,
Victor Stinner41a863c2012-02-24 00:37:51 +01002089 PyObject *thousands_sep,
2090 Py_UCS4 *maxchar);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002091#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002092/* === Characters Type APIs =============================================== */
2093
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002094/* Helper array used by Py_UNICODE_ISSPACE(). */
2095
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002096#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002097PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2098
Guido van Rossumd8225182000-03-10 22:33:05 +00002099/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002100 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00002101
2102 These APIs are implemented in Objects/unicodectype.c.
2103
2104*/
2105
Mark Hammond91a681d2002-08-12 07:21:58 +00002106PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002107 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002108 );
2109
Mark Hammond91a681d2002-08-12 07:21:58 +00002110PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002111 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002112 );
2113
Mark Hammond91a681d2002-08-12 07:21:58 +00002114PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002115 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002116 );
2117
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002118PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002119 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002120 );
2121
2122PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002123 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002124 );
2125
Mark Hammond91a681d2002-08-12 07:21:58 +00002126PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002127 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002128 );
2129
Mark Hammond91a681d2002-08-12 07:21:58 +00002130PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002131 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002132 );
2133
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002134PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2135 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002136 );
2137
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002138PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2139 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002140 );
2141
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002142PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2143 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002144 );
2145
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002146PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2147 Py_UCS4 ch, /* Unicode character */
2148 Py_UCS4 *res
2149 );
2150
2151PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2152 Py_UCS4 ch, /* Unicode character */
2153 Py_UCS4 *res
2154 );
2155
2156PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2157 Py_UCS4 ch, /* Unicode character */
2158 Py_UCS4 *res
2159 );
2160
Benjamin Petersond5890c82012-01-14 13:23:30 -05002161PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2162 Py_UCS4 ch, /* Unicode character */
2163 Py_UCS4 *res
2164 );
2165
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002166PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002167 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002168 );
2169
2170PyAPI_FUNC(int) _PyUnicode_IsCased(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002171 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002172 );
2173
Mark Hammond91a681d2002-08-12 07:21:58 +00002174PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002175 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002176 );
2177
Mark Hammond91a681d2002-08-12 07:21:58 +00002178PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002179 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002180 );
2181
Mark Hammond91a681d2002-08-12 07:21:58 +00002182PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002183 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002184 );
2185
Mark Hammond91a681d2002-08-12 07:21:58 +00002186PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002187 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002188 );
2189
Mark Hammond91a681d2002-08-12 07:21:58 +00002190PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002191 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002192 );
2193
Mark Hammond91a681d2002-08-12 07:21:58 +00002194PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002195 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002196 );
2197
Georg Brandl559e5d72008-06-11 18:37:52 +00002198PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002199 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002200 );
2201
Mark Hammond91a681d2002-08-12 07:21:58 +00002202PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002203 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002204 );
2205
Victor Stinneref8d95c2010-08-16 22:03:11 +00002206PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2207 const Py_UNICODE *u
2208 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002209
2210PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002211 Py_UNICODE *s1,
2212 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002213
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002214PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2215 Py_UNICODE *s1, const Py_UNICODE *s2);
2216
Martin v. Löwis5b222132007-06-10 09:51:05 +00002217PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002218 Py_UNICODE *s1,
2219 const Py_UNICODE *s2,
2220 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002221
2222PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002223 const Py_UNICODE *s1,
2224 const Py_UNICODE *s2
2225 );
2226
2227PyAPI_FUNC(int) Py_UNICODE_strncmp(
2228 const Py_UNICODE *s1,
2229 const Py_UNICODE *s2,
2230 size_t n
2231 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002232
2233PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002234 const Py_UNICODE *s,
2235 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002236 );
2237
Victor Stinner331ea922010-08-10 16:37:20 +00002238PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002239 const Py_UNICODE *s,
2240 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002241 );
2242
Victor Stinner71133ff2010-09-01 23:43:53 +00002243/* Create a copy of a unicode string ending with a nul character. Return NULL
2244 and raise a MemoryError exception on memory allocation failure, otherwise
2245 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2246
Victor Stinner46408602010-09-03 16:18:00 +00002247PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002248 PyObject *unicode
2249 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002250#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002251
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002252#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002253PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002254 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002255 int check_content);
2256#endif
2257
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002258/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2259PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2260/* Clear all static strings. */
2261PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2262
Guido van Rossumd8225182000-03-10 22:33:05 +00002263#ifdef __cplusplus
2264}
2265#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002266#endif /* !Py_UNICODEOBJECT_H */