blob: ecf83866b6a62eeb7fc8c2e0d009e67303eb0ec2 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are answered from the
   _Py_ascii_whitespace look-up table; everything else is forwarded to
   _PyUnicode_IsWhitespace().  `ch` is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions: thin aliases for the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)

/* Alphanumeric = alphabetic, decimal, digit or numeric.  The tests are
   tried in that order and stop at the first one that succeeds, so `ch`
   may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))

/* Copy `length` Py_UNICODE units from `source` to `target` with
   Py_MEMCPY(); the byte count is (length)*sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   Each argument is evaluated exactly once (target, then value, then
   length), so expressions with side effects are safe to pass.  Nothing is
   written when `length` is zero or negative. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        Py_UNICODE v_ = (value); \
        /* Capture the bound once instead of re-evaluating it per pass. */ \
        Py_ssize_t n_ = (length); \
        Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) \
            t_[i_] = v_; \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* Macros to work with UTF-16 surrogates.

   Every use of the argument is parenthesized, so arbitrary expressions
   (e.g. `c & 0xFFFF`) are grouped correctly.  The range checks evaluate
   `ch` twice, so avoid arguments with side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
    (((((Py_UCS4)(high) & 0x03FF) << 10) | \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800
   (`ch` is expected to be in the range U+10000-U+10FFFF) */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   Both objects must already have their wstr (Py_UNICODE) representation.
   The buffers are reached through PyASCIIObject, the structure that
   declares `wstr`, and the length is taken from PyUnicode_WSTR_LENGTH(),
   because compact ASCII objects have no `wstr_length` member.  The first
   and last units are compared before falling back to memcmp().  All three
   arguments are evaluated more than once. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
202
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000203#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000211#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214 structure. state.ascii and state.compact are set, and the data
215 immediately follow the structure. utf8_length and wstr_length can be found
216 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000217typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200218 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200219
220 - compact ascii:
221
222 * structure = PyASCIIObject
223 * kind = PyUnicode_1BYTE_KIND
224 * compact = 1
225 * ascii = 1
226 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200227 * (length is the length of the utf8 and wstr strings)
228 * (data starts just after the structure)
       * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200230
231 - compact:
232
233 * structure = PyCompactUnicodeObject
234 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
235 PyUnicode_4BYTE_KIND
236 * compact = 1
237 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200238 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200239 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200240 * utf8_length = 0 if utf8 is NULL
241 * wstr is shared with data and wstr_length=length
242 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100243 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200244 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200245 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200246
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200247 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200248
249 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100250 * length = 0 (use wstr_length)
251 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200252 * kind = PyUnicode_WCHAR_KIND
253 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200254 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200255 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100256 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200257 * wstr is not NULL
258 * data.any is NULL
259 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200260 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200261
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200262 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200263
264 * structure = PyUnicodeObject structure
265 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
266 PyUnicode_4BYTE_KIND
267 * compact = 0
268 * ready = 1
269 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200270 * utf8 is shared and utf8_length = length with data.any if ascii = 1
271 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100272 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200273 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
         or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
275 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200276
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200277 Compact strings use only one memory block (structure + characters),
278 whereas legacy strings use one block for the structure and one block
279 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200280
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200281 Legacy strings are created by PyUnicode_FromUnicode() and
282 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
283 when PyUnicode_READY() is called.
284
285 See also _PyUnicode_CheckConsistency().
286 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000287 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200288 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000289 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200290 struct {
291 /*
292 SSTATE_NOT_INTERNED (0)
293 SSTATE_INTERNED_MORTAL (1)
294 SSTATE_INTERNED_IMMORTAL (2)
295
296 If interned != SSTATE_NOT_INTERNED, the two references from the
297 dictionary to this object are *not* counted in ob_refcnt.
298 */
299 unsigned int interned:2;
300 /* Character size:
301
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302 - PyUnicode_WCHAR_KIND (0):
303
304 * character type = wchar_t (16 or 32 bits, depending on the
305 platform)
306
307 - PyUnicode_1BYTE_KIND (1):
308
309 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100310 * all characters are in the range U+0000-U+00FF (latin1)
311 * if ascii is set, all characters are in the range U+0000-U+007F
312 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200313 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200314
315 - PyUnicode_2BYTE_KIND (2):
316
317 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100318 * all characters are in the range U+0000-U+FFFF (BMP)
319 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200320
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200321 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200322
323 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100324 * all characters are in the range U+0000-U+10FFFF
325 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200326 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200327 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200328 /* Compact is with respect to the allocation scheme. Compact unicode
329 objects only require one memory block while non-compact objects use
330 one block for the PyUnicodeObject struct and another for its data
331 buffer. */
332 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100333 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200334 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
335 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200336 unsigned int ascii:1;
337 /* The ready flag indicates whether the object layout is initialized
338 completely. This means that this is either a compact object, or
339 the data pointer is filled out. The bit is redundant, and helps
340 to minimize the test in PyUnicode_IS_READY(). */
341 unsigned int ready:1;
342 } state;
343 wchar_t *wstr; /* wchar_t representation (null-terminated) */
344} PyASCIIObject;
345
346/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200347 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348 immediately follow the structure. */
349typedef struct {
350 PyASCIIObject _base;
351 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
352 * terminating \0. */
353 char *utf8; /* UTF-8 representation (null-terminated) */
354 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
355 * surrogates count as two code points. */
356} PyCompactUnicodeObject;
357
358/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
359 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200360 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200361typedef struct {
362 PyCompactUnicodeObject _base;
363 union {
364 void *any;
365 Py_UCS1 *latin1;
366 Py_UCS2 *ucs2;
367 Py_UCS4 *ucs4;
368 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000369} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000370#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000371
Mark Hammond91a681d2002-08-12 07:21:58 +0000372PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000373PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000374
Thomas Wouters27d517b2007-02-25 20:39:11 +0000375#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000376 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
377#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000378
379/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000380#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381
/* Length of the wstr representation.  Compact ASCII objects use the
   PyASCIIObject layout, which has no wstr_length member, so their `length`
   field is returned instead; every other object is read through
   PyCompactUnicodeObject.  No type checks are performed and `op` is
   evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
386
387/* Returns the deprecated Py_UNICODE representation's size in code units
388 (this includes surrogate pairs as 2 units).
389 If the Py_UNICODE representation is not available, it will be computed
390 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
391
Victor Stinnerf3ae6202011-11-21 02:24:49 +0100392#define PyUnicode_GET_SIZE(op) \
393 (assert(PyUnicode_Check(op)), \
394 (((PyASCIIObject *)(op))->wstr) ? \
395 PyUnicode_WSTR_LENGTH(op) : \
396 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
397 assert(((PyASCIIObject *)(op))->wstr), \
398 PyUnicode_WSTR_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200399
Guido van Rossumd8225182000-03-10 22:33:05 +0000400#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
402
403/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
404 representation on demand. Using this macro is very inefficient now,
405 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
406 use PyUnicode_WRITE() and PyUnicode_READ(). */
407
Guido van Rossumd8225182000-03-10 22:33:05 +0000408#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200409 (assert(PyUnicode_Check(op)), \
410 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
411 PyUnicode_AsUnicode((PyObject *)(op)))
412
Guido van Rossumd8225182000-03-10 22:33:05 +0000413#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414 ((const char *)(PyUnicode_AS_UNICODE(op)))
415
416
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200417/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200418
Victor Stinner6f9568b2011-11-17 00:12:44 +0100419/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200420
421/* Interning state. */
422#define SSTATE_NOT_INTERNED 0
423#define SSTATE_INTERNED_MORTAL 1
424#define SSTATE_INTERNED_IMMORTAL 2
425
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  The type and readiness are only verified through assert();
   `op` is evaluated more than once when assertions are enabled. */
#define PyUnicode_IS_ASCII(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject*)(op))->state.ascii)
Victor Stinnera3b334d2011-10-03 13:53:37 +0200433
434/* Return true if the string is compact or 0 if not.
435 No type checks or Ready calls are performed. */
436#define PyUnicode_IS_COMPACT(op) \
437 (((PyASCIIObject*)(op))->state.compact)
438
/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` may be evaluated twice. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200444enum PyUnicode_Kind {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200446 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447 has not been called yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200448 PyUnicode_WCHAR_KIND = 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200449/* Return values of the PyUnicode_KIND() macro: */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200450 PyUnicode_1BYTE_KIND = 1,
451 PyUnicode_2BYTE_KIND = 2,
452 PyUnicode_4BYTE_KIND = 4
453};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454
Georg Brandl4975a9b2011-10-05 16:12:21 +0200455/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200456 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200457 No checks are performed, use PyUnicode_KIND() before to ensure
458 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200459
460#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
461#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
462#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
463
Victor Stinner157f83f2011-09-28 21:41:31 +0200464/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200465#define PyUnicode_KIND(op) \
466 (assert(PyUnicode_Check(op)), \
467 assert(PyUnicode_IS_READY(op)), \
468 ((PyASCIIObject *)(op))->state.kind)
469
Victor Stinner157f83f2011-09-28 21:41:31 +0200470/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200472 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200473 ((void*)((PyASCIIObject*)(op) + 1)) : \
474 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
475
476#define _PyUnicode_NONCOMPACT_DATA(op) \
477 (assert(((PyUnicodeObject*)(op))->data.any), \
478 ((((PyUnicodeObject *)(op))->data.any)))
479
480#define PyUnicode_DATA(op) \
481 (assert(PyUnicode_Check(op)), \
482 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
483 _PyUnicode_NONCOMPACT_DATA(op))
484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200485/* In the access macros below, "kind" may be evaluated more than once.
486 All other macro parameters are evaluated exactly once, so it is safe
487 to put side effects into them (such as increasing the index). */
488
/* Store the code point `value` at position `index` (0-based) of the
   canonical buffer `data`, whose element width is selected by `kind`.
   The value is narrowed to the element type with a plain cast.  No sanity
   checks are made apart from an assert() that any kind other than 1-byte
   or 2-byte is PyUnicode_4BYTE_KIND.  Meant for loops: the caller should
   fetch kind and data once and reuse them. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
511
/* Fetch the code point at `index` from the canonical buffer `data`, whose
   element width is selected by `kind`; the result is converted to Py_UCS4.
   Any kind other than 1-byte or 2-byte is read as Py_UCS4.  No checks or
   ready calls are performed, and `kind` may be evaluated twice. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND \
          ? ((const Py_UCS1 *)(data))[(index)] \
          : ((kind) == PyUnicode_2BYTE_KIND \
                 ? ((const Py_UCS2 *)(data))[(index)] \
                 : ((const Py_UCS4 *)(data))[(index)])))
523
524/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
525 calls PyUnicode_KIND() and might call it twice. For single reads, use
526 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
527 cache kind and use PyUnicode_READ instead. */
528#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200529 (assert(PyUnicode_Check(unicode)), \
530 assert(PyUnicode_IS_READY(unicode)), \
531 (Py_UCS4) \
532 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
533 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
534 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
535 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
536 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
537 ) \
538 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_READY() to ensure that. */
543#define PyUnicode_GET_LENGTH(op) \
544 (assert(PyUnicode_Check(op)), \
545 assert(PyUnicode_IS_READY(op)), \
546 ((PyASCIIObject *)(op))->length)
547
548
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Only the state.ready bit is read; no type check is performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
553
Victor Stinnera3b334d2011-10-03 13:53:37 +0200554/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200556 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 Returns 0 on success and -1 on errors. */
558#define PyUnicode_READY(op) \
559 (assert(PyUnicode_Check(op)), \
560 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200561 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200562
/* Return an upper bound on the code points stored in op, suitable for
   creating another string based on op.  The bound is derived from the
   ASCII flag and the kind only -- the characters are not scanned -- so it
   is an approximation that may exceed the real maximum.  op must be ready
   and is evaluated several times. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_ASCII(op) ? (0x7f) : \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU) : \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU) : \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200575
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000576#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000577
578/* --- Constants ---------------------------------------------------------- */
579
/* Replacement character used during decoding when the errors argument is
   set to "replace".  U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4)0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000586
587/* === Public API ========================================================= */
588
589/* --- Plain Py_UNICODE --------------------------------------------------- */
590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591/* With PEP 393, this is the recommended way to allocate a new unicode object.
592 This function will allocate the object and its buffer in a single memory
593 block. Objects created using this function are not resizable. */
594#ifndef Py_LIMITED_API
595PyAPI_FUNC(PyObject*) PyUnicode_New(
596 Py_ssize_t size, /* Number of code points in the new string */
597 Py_UCS4 maxchar /* maximum code point value in the string */
598 );
599#endif
600
/* Initializes the canonical string representation from the deprecated
602 wstr/Py_UNICODE representation. This function is used to convert Unicode
603 objects which were created using the old API to the new flexible format
604 introduced with PEP 393.
605
606 Don't call this function directly, use the public PyUnicode_READY() macro
607 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608#ifndef Py_LIMITED_API
609PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200610 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611 );
612#endif
613
Victor Stinner034f6cf2011-09-30 02:26:44 +0200614/* Get a copy of a Unicode string. */
615PyAPI_FUNC(PyObject*) PyUnicode_Copy(
616 PyObject *unicode
617 );
618
/* Copy characters from one Unicode object into another. This function performs
   character conversion when necessary and falls back to memcpy if possible.
621
Victor Stinnera0702ab2011-09-29 14:14:38 +0200622 Fail if to is too small (smaller than how_many or smaller than
623 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
624 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200625
   Return the number of characters written, or return -1 and raise an exception
627 on error.
628
629 Pseudo-code:
630
631 how_many = min(how_many, len(from) - from_start)
632 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
633 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200634
635 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200636 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200639 PyObject *to,
640 Py_ssize_t to_start,
641 PyObject *from,
642 Py_ssize_t from_start,
643 Py_ssize_t how_many
644 );
645#endif
646
Guido van Rossumd8225182000-03-10 22:33:05 +0000647/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000649
650 u may be NULL which causes the contents to be undefined. It is the
651 user's responsibility to fill in the needed data afterwards. Note
652 that modifying the Unicode object contents after construction is
653 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000654
655 The buffer is copied into the new object. */
656
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000657#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000658PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000659 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000660 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000661 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000662#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000663
Georg Brandl952867a2010-06-27 10:17:12 +0000664/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000665PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000666 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000667 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 );
669
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000670/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000672PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000673 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000674 );
675
Victor Stinnerb9275c12011-10-05 14:01:42 +0200676/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
677 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200678#ifndef Py_LIMITED_API
679PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
680 int kind,
681 const void *buffer,
682 Py_ssize_t size);
683#endif
684
685PyAPI_FUNC(PyObject*) PyUnicode_Substring(
686 PyObject *str,
687 Py_ssize_t start,
688 Py_ssize_t end);
689
Georg Brandldb6c7f52011-10-07 11:19:11 +0200690/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 is set. Return NULL and raise an exception on error. Raise a ValueError if
692 the buffer is smaller than the string. Return buffer on success.
693
694 buflen is the length of the buffer in (Py_UCS4) characters. */
695PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
696 PyObject *unicode,
697 Py_UCS4* buffer,
698 Py_ssize_t buflen,
699 int copy_null);
700
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
   PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
704PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
705
Guido van Rossumd8225182000-03-10 22:33:05 +0000706/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707 Py_UNICODE buffer.
708 If the wchar_t/Py_UNICODE representation is not yet available, this
709 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000710
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000711#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000712PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000714 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000715#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717/* Return a read-only pointer to the Unicode object's internal
718 Py_UNICODE buffer and save the length at size.
719 If the wchar_t/Py_UNICODE representation is not yet available, this
720 function will calculate it. */
721
722#ifndef Py_LIMITED_API
723PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
724 PyObject *unicode, /* Unicode object */
725 Py_ssize_t *size /* location where to save the length */
726 );
727#endif
728
Guido van Rossumd8225182000-03-10 22:33:05 +0000729/* Get the length of the Unicode object. */
730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200731PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
732 PyObject *unicode
733);
734
Victor Stinner157f83f2011-09-28 21:41:31 +0200735/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 string representation. */
737
Martin v. Löwis18e16552006-02-15 17:27:45 +0000738PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000740 );
741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742/* Read a character from the string. */
743
744PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
745 PyObject *unicode,
746 Py_ssize_t index
747 );
748
749/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200750 PyUnicode_New, must not be shared, and must not have been hashed yet.
751
752 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200753
754PyAPI_FUNC(int) PyUnicode_WriteChar(
755 PyObject *unicode,
756 Py_ssize_t index,
757 Py_UCS4 character
758 );
759
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000760#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000761/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000762PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000763#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000764
/* Resize a Unicode object allocated by the legacy API (e.g.
766 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
767 PyUnicode_New) cannot be resized by this function.
768
Victor Stinner93439992011-11-20 18:29:14 +0100769 The length is a number of characters (and not the number of Py_UNICODE characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000770
771 *unicode is modified to point to the new (resized) object and 0
772 returned on success.
773
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200774 If the refcount on the object is 1, the function resizes the string in
   place, which is usually faster than allocating a new string (and copying
776 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000777
778 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200779 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000780
Mark Hammond91a681d2002-08-12 07:21:58 +0000781PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 PyObject **unicode, /* Pointer to the Unicode object */
783 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000784 );
785
Guido van Rossumd8225182000-03-10 22:33:05 +0000786/* Coerce obj to an Unicode object and return a reference with
787 *incremented* refcount.
788
789 Coercion is done in the following way:
790
Georg Brandl952867a2010-06-27 10:17:12 +0000791 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000792 under the assumptions that they contain data using the UTF-8
793 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000794
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000795 2. All other objects (including Unicode objects) raise an
796 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000797
798 The API returns NULL in case of an error. The caller is responsible
799 for decref'ing the returned objects.
800
801*/
802
Mark Hammond91a681d2002-08-12 07:21:58 +0000803PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000805 const char *encoding, /* encoding */
806 const char *errors /* error handling */
807 );
808
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000809/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000810 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000812 Unicode objects are passed back as-is (subclasses are converted to
813 true Unicode objects), all other objects are delegated to
814 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000815 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000816
817 The API returns NULL in case of an error. The caller is responsible
818 for decref'ing the returned objects.
819
820*/
821
Mark Hammond91a681d2002-08-12 07:21:58 +0000822PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000824 );
825
Victor Stinner1205f272010-09-11 00:54:47 +0000826PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
827 const char *format, /* ASCII-encoded string */
828 va_list vargs
829 );
830PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
831 const char *format, /* ASCII-encoded string */
832 ...
833 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000834
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000835#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000836/* Format the object based on the format_spec, as defined in PEP 3101
837 (Advanced String Formatting). */
838PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 PyObject *format_spec,
840 Py_ssize_t start,
841 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000842#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000843
Walter Dörwald16807132007-05-25 13:52:07 +0000844PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
845PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000846PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
847 const char *u /* UTF-8 encoded string */
848 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000849#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000850PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000851#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000852
/* Read the interned state of op.  op is cast to PyASCIIObject* without any
   type check, so use this only if you know it's a string. */
#define PyUnicode_CHECK_INTERNED(op) (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000856
Guido van Rossumd8225182000-03-10 22:33:05 +0000857/* --- wchar_t support for platforms which support it --------------------- */
858
859#ifdef HAVE_WCHAR_H
860
Georg Brandl952867a2010-06-27 10:17:12 +0000861/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000862 size.
863
864 The buffer is copied into the new object. */
865
Mark Hammond91a681d2002-08-12 07:21:58 +0000866PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000868 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000869 );
870
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000871/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000872 most size wchar_t characters are copied.
873
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000874 Note that the resulting wchar_t string may or may not be
875 0-terminated. It is the responsibility of the caller to make sure
876 that the wchar_t string is 0-terminated in case this is required by
877 the application.
878
879 Returns the number of wchar_t characters copied (excluding a
880 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000881 error. */
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000884 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000885 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000886 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000887 );
888
Victor Stinner137c34c2010-09-29 10:25:54 +0000889/* Convert the Unicode object to a wide character string. The output string
890 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200891 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000892
   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
894 on success. On error, returns NULL, *size is undefined and raises a
895 MemoryError. */
896
897PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000898 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000899 Py_ssize_t *size /* number of characters of the result */
900 );
901
Victor Stinner9f789e72011-10-01 03:57:28 +0200902#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200904#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905
Guido van Rossumd8225182000-03-10 22:33:05 +0000906#endif
907
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000908/* --- Unicode ordinals --------------------------------------------------- */
909
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910/* Create a Unicode Object from the given Unicode code point ordinal.
911
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912 The ordinal must be in range(0x10000) on narrow Python builds
913 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
914 raised in case it is not.
915
916*/
917
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000918PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000920/* --- Free-list management ----------------------------------------------- */
921
922/* Clear the free list used by the Unicode implementation.
923
924 This can be used to release memory used for objects on the free
925 list back to the Python memory allocator.
926
927*/
928
929PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
930
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000932
933 Many of these APIs take two arguments encoding and errors. These
934 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000935 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000936
Georg Brandl952867a2010-06-27 10:17:12 +0000937 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000938
939 Error handling is set by errors which may also be set to NULL
940 meaning to use the default handling defined for the codec. Default
941 error handling for all builtin codecs is "strict" (ValueErrors are
942 raised).
943
   The codecs all use a similar interface. Only deviations from the
945 generic ones are documented.
946
947*/
948
Fred Drakecb093fe2000-05-09 19:51:53 +0000949/* --- Manage the default encoding ---------------------------------------- */
950
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000951/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000952 Unicode object unicode and the size of the encoded representation
953 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000954
   In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000956
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200957 This function caches the UTF-8 encoded string in the unicodeobject
958 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 when the unicodeobject is deallocated.
960
961 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
962 support the previous internal function with the same behaviour.
963
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000964 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000965 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000966
967 *** If you need to access the Unicode object as UTF-8 bytes string,
968 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000969*/
970
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000971#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000973 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000974 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000976#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000977
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000978/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000979 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
982 in the unicodeobject.
983
984 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
985 support the previous internal function with the same behaviour.
986
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000987 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000988 extracted from the returned data.
989
990 *** This API is for interpreter INTERNAL USE ONLY and will likely
   *** be removed or changed in the future.
992
993 *** If you need to access the Unicode object as UTF-8 bytes string,
994 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000995
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000996*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000997
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000998#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1000#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001001#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001002
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001003/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001004
Mark Hammond91a681d2002-08-12 07:21:58 +00001005PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001006
Guido van Rossumd8225182000-03-10 22:33:05 +00001007/* --- Generic Codecs ----------------------------------------------------- */
1008
1009/* Create a Unicode object by decoding the encoded string s of the
1010 given size. */
1011
Mark Hammond91a681d2002-08-12 07:21:58 +00001012PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001013 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001014 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001015 const char *encoding, /* encoding */
1016 const char *errors /* error handling */
1017 );
1018
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001019/* Decode a Unicode object unicode and return the result as Python
1020 object. */
1021
1022PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023 PyObject *unicode, /* Unicode object */
1024 const char *encoding, /* encoding */
1025 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001026 );
1027
1028/* Decode a Unicode object unicode and return the result as Unicode
1029 object. */
1030
1031PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001032 PyObject *unicode, /* Unicode object */
1033 const char *encoding, /* encoding */
1034 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001035 );
1036
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001038 Python string object. */
1039
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001040#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001041PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001042 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001043 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001044 const char *encoding, /* encoding */
1045 const char *errors /* error handling */
1046 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001047#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001048
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001049/* Encodes a Unicode object and returns the result as Python
1050 object. */
1051
1052PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001053 PyObject *unicode, /* Unicode object */
1054 const char *encoding, /* encoding */
1055 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001056 );
1057
Guido van Rossumd8225182000-03-10 22:33:05 +00001058/* Encodes a Unicode object and returns the result as Python string
1059 object. */
1060
Mark Hammond91a681d2002-08-12 07:21:58 +00001061PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 PyObject *unicode, /* Unicode object */
1063 const char *encoding, /* encoding */
1064 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001065 );
1066
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001067/* Encodes a Unicode object and returns the result as Unicode
1068 object. */
1069
1070PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 PyObject *unicode, /* Unicode object */
1072 const char *encoding, /* encoding */
1073 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001074 );
1075
1076/* Build an encoding map. */
1077
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001078PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1079 PyObject* string /* 256 character map */
1080 );
1081
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001082/* --- UTF-7 Codecs ------------------------------------------------------- */
1083
Mark Hammond91a681d2002-08-12 07:21:58 +00001084PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 const char *string, /* UTF-7 encoded string */
1086 Py_ssize_t length, /* size of string */
1087 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001088 );
1089
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001090PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 const char *string, /* UTF-7 encoded string */
1092 Py_ssize_t length, /* size of string */
1093 const char *errors, /* error handling */
1094 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001095 );
1096
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001097#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001098PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099 const Py_UNICODE *data, /* Unicode char buffer */
1100 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1101 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1102 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1103 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001104 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001105PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1106 PyObject *unicode, /* Unicode object */
1107 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1108 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1109 const char *errors /* error handling */
1110 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001111#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001112
Guido van Rossumd8225182000-03-10 22:33:05 +00001113/* --- UTF-8 Codecs ------------------------------------------------------- */
1114
Mark Hammond91a681d2002-08-12 07:21:58 +00001115PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116 const char *string, /* UTF-8 encoded string */
1117 Py_ssize_t length, /* size of string */
1118 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001119 );
1120
Walter Dörwald69652032004-09-07 20:24:22 +00001121PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 const char *string, /* UTF-8 encoded string */
1123 Py_ssize_t length, /* size of string */
1124 const char *errors, /* error handling */
1125 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001126 );
1127
Mark Hammond91a681d2002-08-12 07:21:58 +00001128PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001129 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001130 );
1131
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001132#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1134 PyObject *unicode,
1135 const char *errors);
1136
Mark Hammond91a681d2002-08-12 07:21:58 +00001137PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001138 const Py_UNICODE *data, /* Unicode char buffer */
1139 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1140 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001141 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001142#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001143
Walter Dörwald41980ca2007-08-16 21:55:45 +00001144/* --- UTF-32 Codecs ------------------------------------------------------ */
1145
1146/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1147 the corresponding Unicode object.
1148
1149 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001150 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001151
1152 If byteorder is non-NULL, the decoder starts decoding using the
1153 given byte order:
1154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 *byteorder == -1: little endian
1156 *byteorder == 0: native order
1157 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001158
1159 In native mode, the first four bytes of the stream are checked for a
1160 BOM mark. If found, the BOM mark is analysed, the byte order
1161 adjusted and the BOM skipped. In the other modes, no BOM mark
1162 interpretation is done. After completion, *byteorder is set to the
1163 current byte order at the end of input data.
1164
1165 If byteorder is NULL, the codec starts in native order mode.
1166
1167*/
1168
1169PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 const char *string, /* UTF-32 encoded string */
1171 Py_ssize_t length, /* size of string */
1172 const char *errors, /* error handling */
1173 int *byteorder /* pointer to byteorder to use
1174 0=native;-1=LE,1=BE; updated on
1175 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001176 );
1177
1178PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 const char *string, /* UTF-32 encoded string */
1180 Py_ssize_t length, /* size of string */
1181 const char *errors, /* error handling */
1182 int *byteorder, /* pointer to byteorder to use
1183 0=native;-1=LE,1=BE; updated on
1184 exit */
1185 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001186 );
1187
1188/* Returns a Python string using the UTF-32 encoding in native byte
1189 order. The string always starts with a BOM mark. */
1190
1191PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001192 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001193 );
1194
1195/* Returns a Python string object holding the UTF-32 encoded value of
1196 the Unicode data.
1197
1198 If byteorder is not 0, output is written according to the following
1199 byte order:
1200
1201 byteorder == -1: little endian
1202 byteorder == 0: native byte order (writes a BOM mark)
1203 byteorder == 1: big endian
1204
1205 If byteorder is 0, the output string will always start with the
1206 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1207 prepended.
1208
1209*/
1210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001211#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001212PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001213 const Py_UNICODE *data, /* Unicode char buffer */
1214 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1215 const char *errors, /* error handling */
1216 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001217 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001218PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1219 PyObject *object, /* Unicode object */
1220 const char *errors, /* error handling */
1221 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1222 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001223#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001224
Guido van Rossumd8225182000-03-10 22:33:05 +00001225/* --- UTF-16 Codecs ------------------------------------------------------ */
1226
Guido van Rossum9e896b32000-04-05 20:11:21 +00001227/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001228 the corresponding Unicode object.
1229
1230 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001232
1233 If byteorder is non-NULL, the decoder starts decoding using the
1234 given byte order:
1235
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001236 *byteorder == -1: little endian
1237 *byteorder == 0: native order
1238 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001239
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001240 In native mode, the first two bytes of the stream are checked for a
1241 BOM mark. If found, the BOM mark is analysed, the byte order
1242 adjusted and the BOM skipped. In the other modes, no BOM mark
1243 interpretation is done. After completion, *byteorder is set to the
1244 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001245
1246 If byteorder is NULL, the codec starts in native order mode.
1247
1248*/
1249
Mark Hammond91a681d2002-08-12 07:21:58 +00001250PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 const char *string, /* UTF-16 encoded string */
1252 Py_ssize_t length, /* size of string */
1253 const char *errors, /* error handling */
1254 int *byteorder /* pointer to byteorder to use
1255 0=native;-1=LE,1=BE; updated on
1256 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001257 );
1258
Walter Dörwald69652032004-09-07 20:24:22 +00001259PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 const char *string, /* UTF-16 encoded string */
1261 Py_ssize_t length, /* size of string */
1262 const char *errors, /* error handling */
1263 int *byteorder, /* pointer to byteorder to use
1264 0=native;-1=LE,1=BE; updated on
1265 exit */
1266 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001267 );
1268
Guido van Rossumd8225182000-03-10 22:33:05 +00001269/* Returns a Python string using the UTF-16 encoding in native byte
1270 order. The string always starts with a BOM mark. */
1271
Mark Hammond91a681d2002-08-12 07:21:58 +00001272PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001274 );
1275
1276/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001277 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001278
1279 If byteorder is not 0, output is written according to the following
1280 byte order:
1281
1282 byteorder == -1: little endian
1283 byteorder == 0: native byte order (writes a BOM mark)
1284 byteorder == 1: big endian
1285
1286 If byteorder is 0, the output string will always start with the
1287 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1288 prepended.
1289
1290 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1291 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001292 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001293
1294*/
1295
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001296#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001297PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 const Py_UNICODE *data, /* Unicode char buffer */
1299 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1300 const char *errors, /* error handling */
1301 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001303PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1304 PyObject* unicode, /* Unicode object */
1305 const char *errors, /* error handling */
1306 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1307 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001308#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001309
1310/* --- Unicode-Escape Codecs ---------------------------------------------- */
1311
Mark Hammond91a681d2002-08-12 07:21:58 +00001312PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 const char *string, /* Unicode-Escape encoded string */
1314 Py_ssize_t length, /* size of string */
1315 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001316 );
1317
Mark Hammond91a681d2002-08-12 07:21:58 +00001318PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001320 );
1321
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001323PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001324 const Py_UNICODE *data, /* Unicode char buffer */
1325 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001326 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001327#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001328
1329/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1330
Mark Hammond91a681d2002-08-12 07:21:58 +00001331PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 const char *string, /* Raw-Unicode-Escape encoded string */
1333 Py_ssize_t length, /* size of string */
1334 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001335 );
1336
Mark Hammond91a681d2002-08-12 07:21:58 +00001337PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001339 );
1340
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001341#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001342PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 const Py_UNICODE *data, /* Unicode char buffer */
1344 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001345 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001346#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001347
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001348/* --- Unicode Internal Codec ---------------------------------------------
1349
1350 Only for internal use in _codecsmodule.c */
1351
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001352#ifndef Py_LIMITED_API
/* NOTE(review): unlike the neighbouring prototypes, this one is declared
   without PyAPI_FUNC; presumably intentional because the function is only
   meant for internal use in _codecsmodule.c -- confirm. */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001353PyObject *_PyUnicode_DecodeUnicodeInternal(
1354 const char *string, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355 Py_ssize_t length, /* size of string */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001356 const char *errors /* error handling */
1357 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001358#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001359
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001361
1362 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1363
1364*/
1365
Mark Hammond91a681d2002-08-12 07:21:58 +00001366PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 const char *string, /* Latin-1 encoded string */
1368 Py_ssize_t length, /* size of string */
1369 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001370 );
1371
Mark Hammond91a681d2002-08-12 07:21:58 +00001372PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001374 );
1375
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001376#ifndef Py_LIMITED_API
/* NOTE(review): presumably PyUnicode_AsLatin1String() with an explicit error
   handler; guarded by Py_LIMITED_API, so not part of the limited API --
   confirm semantics against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1378 PyObject* unicode, /* Unicode object */
1379 const char* errors); /* error handling */
1380
Mark Hammond91a681d2002-08-12 07:21:58 +00001381PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382 const Py_UNICODE *data, /* Unicode char buffer */
1383 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1384 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001385 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001386#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001387
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001389
1390 Only 7-bit ASCII data is accepted. All other codes generate errors.
1391
1392*/
1393
Mark Hammond91a681d2002-08-12 07:21:58 +00001394PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 const char *string, /* ASCII encoded string */
1396 Py_ssize_t length, /* size of string */
1397 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001398 );
1399
Mark Hammond91a681d2002-08-12 07:21:58 +00001400PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001402 );
1403
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001404#ifndef Py_LIMITED_API
/* NOTE(review): presumably PyUnicode_AsASCIIString() with an explicit error
   handler; guarded by Py_LIMITED_API, so not part of the limited API --
   confirm semantics against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1406 PyObject* unicode, /* Unicode object */
1407 const char* errors); /* error handling */
1408
Mark Hammond91a681d2002-08-12 07:21:58 +00001409PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 const Py_UNICODE *data, /* Unicode char buffer */
1411 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1412 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001413 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001414#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001415
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001417
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001419
1420 Decoding mappings must map single string characters to single
1421 Unicode characters, integers (which are then interpreted as Unicode
1422 ordinals) or None (meaning "undefined mapping" and causing an
1423 error).
1424
1425 Encoding mappings must map single Unicode characters to single
1426 string characters, integers (which are then interpreted as Latin-1
1427 ordinals) or None (meaning "undefined mapping" and causing an
1428 error).
1429
1430 If a character lookup fails with a LookupError, the character is
1431 copied as-is, meaning that its ordinal value will be interpreted as a
1432 Unicode or Latin-1 ordinal, respectively. Because of this, mappings only
1433 need to contain those entries which map characters to different code
1434 points.
1435
1436*/
1437
Mark Hammond91a681d2002-08-12 07:21:58 +00001438PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001439 const char *string, /* Encoded string */
1440 Py_ssize_t length, /* size of string */
1441 PyObject *mapping, /* character mapping
1442 (char ordinal -> unicode ordinal) */
1443 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001444 );
1445
Mark Hammond91a681d2002-08-12 07:21:58 +00001446PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 PyObject *unicode, /* Unicode object */
1448 PyObject *mapping /* character mapping
1449 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001450 );
1451
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001452#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001453PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001454 const Py_UNICODE *data, /* Unicode char buffer */
1455 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1456 PyObject *mapping, /* character mapping
1457 (unicode ordinal -> char ordinal) */
1458 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001459 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001460PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1461 PyObject *unicode, /* Unicode object */
1462 PyObject *mapping, /* character mapping
1463 (unicode ordinal -> char ordinal) */
1464 const char *errors /* error handling */
1465 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001466#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001467
1468/* Translate a Py_UNICODE buffer of the given length by applying a
1469 character mapping table to it and return the resulting Unicode
1470 object.
1471
1472 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001474
1475 Mapping tables may be dictionaries or sequences. Unmapped character
1476 ordinals (ones which cause a LookupError) are left untouched and
1477 are copied as-is.
1478
1479*/
1480
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001481#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001482PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001483 const Py_UNICODE *data, /* Unicode char buffer */
1484 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1485 PyObject *table, /* Translate table */
1486 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001487 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001488#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001489
Victor Stinner99b95382011-07-04 14:23:54 +02001490#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001491
Guido van Rossumefec1152000-03-28 02:01:15 +00001492/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001493
Mark Hammond91a681d2002-08-12 07:21:58 +00001494PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001495 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001496 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001497 const char *errors /* error handling */
1498 );
1499
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001500PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1501 const char *string, /* MBCS encoded string */
1502 Py_ssize_t length, /* size of string */
1503 const char *errors, /* error handling */
1504 Py_ssize_t *consumed /* bytes consumed */
1505 );
1506
Victor Stinner3a50e702011-10-18 21:21:00 +02001507PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1508 int code_page, /* code page number */
1509 const char *string, /* encoded string */
1510 Py_ssize_t length, /* size of string */
1511 const char *errors, /* error handling */
1512 Py_ssize_t *consumed /* bytes consumed */
1513 );
1514
Mark Hammond91a681d2002-08-12 07:21:58 +00001515PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001516 PyObject *unicode /* Unicode object */
1517 );
1518
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001519#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001520PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001521 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001522 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001523 const char *errors /* error handling */
1524 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001525#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001526
Victor Stinner3a50e702011-10-18 21:21:00 +02001527PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1528 int code_page, /* code page number */
1529 PyObject *unicode, /* Unicode object */
1530 const char *errors /* error handling */
1531 );
1532
Victor Stinner99b95382011-07-04 14:23:54 +02001533#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001534
Guido van Rossum9e896b32000-04-05 20:11:21 +00001535/* --- Decimal Encoder ---------------------------------------------------- */
1536
1537/* Takes a Unicode string holding a decimal value and writes it into
1538 an output buffer using standard ASCII digit codes.
1539
1540 The output buffer has to provide at least length+1 bytes of storage
1541 area. The output string is 0-terminated.
1542
1543 The encoder converts whitespace to ' ', decimal characters to their
1544 corresponding ASCII digit and all other Latin-1 characters except
1545 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1546 are treated as errors. This includes embedded NUL characters.
1547
1548 Error handling is defined by the errors argument:
1549
1550 NULL or "strict": raise a ValueError
1551 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001552 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001553 "replace": replaces illegal characters with '?'
1554
1555 Returns 0 on success, -1 on failure.
1556
1557*/
1558
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001559#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001560PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001561 Py_UNICODE *s, /* Unicode buffer */
1562 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1563 char *output, /* Output buffer; must have size >= length+1 (room for the 0 terminator) */
1564 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001565 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001566#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001567
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001568/* Transforms code points that have decimal digit property to the
1569 corresponding ASCII digit code points.
1570
1571 Returns a new Unicode string on success, NULL on failure.
1572*/
1573
Georg Brandlb5503082010-12-05 11:40:48 +00001574#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001575PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1576 Py_UNICODE *s, /* Unicode buffer */
1577 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1578 );
Georg Brandlb5503082010-12-05 11:40:48 +00001579#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001580
Victor Stinner6f9568b2011-11-17 00:12:44 +01001581/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 as argument instead of a raw buffer and length. This function additionally
1583 transforms spaces to ASCII because this is what the callers in longobject,
1584 floatobject, and complexobject did anyway. */
1585
1586#ifndef Py_LIMITED_API
1587PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1588 PyObject *unicode /* Unicode object */
1589 );
1590#endif
1591
Martin v. Löwis011e8422009-05-05 04:43:17 +00001592/* --- File system encoding ---------------------------------------------- */
1593
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001594/* ParseTuple converter: encode str objects to bytes using
1595 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001596
1597PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1598
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001599/* ParseTuple converter: decode bytes objects to unicode using
1600 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1601
1602PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1603
Victor Stinner77c38622010-05-14 15:58:55 +00001604/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1605 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001606
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001607 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1608 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001609
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001610 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001611*/
1612
1613PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1614 const char *s /* encoded string */
1615 );
1616
Victor Stinner77c38622010-05-14 15:58:55 +00001617/* Decode a string using Py_FileSystemDefaultEncoding
1618 and the "surrogateescape" error handler.
1619
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001620 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1621 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001622*/
1623
Martin v. Löwis011e8422009-05-05 04:43:17 +00001624PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1625 const char *s, /* encoded string */
1626 Py_ssize_t size /* size */
1627 );
1628
Victor Stinnerae6265f2010-05-15 16:27:27 +00001629/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001630 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001631
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001632 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1633 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001634*/
1635
1636PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1637 PyObject *unicode
1638 );
1639
Guido van Rossumd8225182000-03-10 22:33:05 +00001640/* --- Methods & Slots ----------------------------------------------------
1641
1642 These are capable of handling Unicode objects and strings on input
1643 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001644 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001645
1646/* Concat two strings giving a new Unicode string. */
1647
Mark Hammond91a681d2002-08-12 07:21:58 +00001648PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001649 PyObject *left, /* Left string */
1650 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001651 );
1652
Walter Dörwald1ab83302007-05-18 17:15:44 +00001653/* Concat two strings and put the result in *pleft
1654 (sets *pleft to NULL on error) */
1655
1656PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001657 PyObject **pleft, /* Pointer to left string */
1658 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001659 );
1660
1661/* Concat two strings, put the result in *pleft and drop the right object
1662 (sets *pleft to NULL on error) */
1663
1664PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 PyObject **pleft, /* Pointer to left string */
1666 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001667 );
1668
Guido van Rossumd8225182000-03-10 22:33:05 +00001669/* Split a string giving a list of Unicode strings.
1670
1671 If sep is NULL, splitting will be done at all whitespace
1672 substrings. Otherwise, splits occur at the given separator.
1673
1674 At most maxsplit splits will be done. If negative, no limit is set.
1675
1676 Separators are not included in the resulting list.
1677
1678*/
1679
Mark Hammond91a681d2002-08-12 07:21:58 +00001680PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001681 PyObject *s, /* String to split */
1682 PyObject *sep, /* String separator */
1683 Py_ssize_t maxsplit /* Maxsplit count */
1684 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001685
1686/* Ditto, but split at line breaks.
1687
1688 CRLF is considered to be one line break. Line breaks are not
1689 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001690
Mark Hammond91a681d2002-08-12 07:21:58 +00001691PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001692 PyObject *s, /* String to split */
1693 int keepends /* If true, line end markers are included */
1694 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001695
Thomas Wouters477c8d52006-05-27 19:21:47 +00001696/* Partition a string using a given separator. */
1697
1698PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001699 PyObject *s, /* String to partition */
1700 PyObject *sep /* String separator */
1701 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001702
1703/* Partition a string using a given separator, searching from the end of the
1704 string. */
1705
1706PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001707 PyObject *s, /* String to partition */
1708 PyObject *sep /* String separator */
1709 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001710
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001711/* Split a string giving a list of Unicode strings.
1712
1713 If sep is NULL, splitting will be done at all whitespace
1714 substrings. Otherwise, splits occur at the given separator.
1715
1716 At most maxsplit splits will be done; if maxsplit is negative, no
1717 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1718 the end of the string.
1719
1720 Separators are not included in the resulting list.
1721
1722*/
1723
1724PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001725 PyObject *s, /* String to split */
1726 PyObject *sep, /* String separator */
1727 Py_ssize_t maxsplit /* Maxsplit count */
1728 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001729
Guido van Rossumd8225182000-03-10 22:33:05 +00001730/* Translate a string by applying a character mapping table to it and
1731 return the resulting Unicode object.
1732
1733 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001734 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001735
1736 Mapping tables may be dictionaries or sequences. Unmapped character
1737 ordinals (ones which cause a LookupError) are left untouched and
1738 are copied as-is.
1739
1740*/
1741
Mark Hammond91a681d2002-08-12 07:21:58 +00001742PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001743 PyObject *str, /* String */
1744 PyObject *table, /* Translate table */
1745 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001746 );
1747
1748/* Join a sequence of strings using the given separator and return
1749 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001750
Mark Hammond91a681d2002-08-12 07:21:58 +00001751PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001752 PyObject *separator, /* Separator string */
1753 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001754 );
1755
1756/* Return 1 if substr matches str[start:end] at the given tail end, 0
1757 otherwise. */
1758
Martin v. Löwis18e16552006-02-15 17:27:45 +00001759PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001760 PyObject *str, /* String */
1761 PyObject *substr, /* Prefix or Suffix string */
1762 Py_ssize_t start, /* Start index */
1763 Py_ssize_t end, /* Stop index */
1764 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001765 );
1766
1767/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001768 given search direction or -1 if not found. -2 is returned in case
1769 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001770
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001772 PyObject *str, /* String */
1773 PyObject *substr, /* Substring to find */
1774 Py_ssize_t start, /* Start index */
1775 Py_ssize_t end, /* Stop index */
1776 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001777 );
1778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779/* Like PyUnicode_Find, but search for single character only. */
1780PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1781 PyObject *str, /* String */
1782 Py_UCS4 ch, /* Character to find */
1783 Py_ssize_t start, /* Start index */
1784 Py_ssize_t end, /* Stop index */
1785 int direction /* Find direction; NOTE(review): presumably +1 forward, -1 backward as for PyUnicode_Find -- confirm */
1786 );
1787
Barry Warsaw51ac5802000-03-20 16:36:48 +00001788/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001789
Martin v. Löwis18e16552006-02-15 17:27:45 +00001790PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001791 PyObject *str, /* String */
1792 PyObject *substr, /* Substring to count */
1793 Py_ssize_t start, /* Start index */
1794 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001795 );
1796
Barry Warsaw51ac5802000-03-20 16:36:48 +00001797/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001798 and return the resulting Unicode object. */
1799
Mark Hammond91a681d2002-08-12 07:21:58 +00001800PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001801 PyObject *str, /* String */
1802 PyObject *substr, /* Substring to find */
1803 PyObject *replstr, /* Substring to replace */
1804 Py_ssize_t maxcount /* Max. number of replacements to apply;
1805 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001806 );
1807
1808/* Compare two strings and return -1, 0, 1 for less than, equal,
1809 greater than resp. */
1810
Mark Hammond91a681d2002-08-12 07:21:58 +00001811PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001812 PyObject *left, /* Left string */
1813 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001814 );
1815
Martin v. Löwis5b222132007-06-10 09:51:05 +00001816PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1817 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001818 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001819 );
1820
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001821/* Rich compare two strings and return one of the following:
1822
1823 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001824 - Py_True or Py_False for successfully comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001825 - Py_NotImplemented in case the type combination is unknown
1826
1827 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1828 case the conversion of the arguments to Unicode fails with a
1829 UnicodeDecodeError.
1830
1831 Possible values for op:
1832
1833 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1834
1835*/
1836
1837PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001838 PyObject *left, /* Left string */
1839 PyObject *right, /* Right string */
1840 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001841 );
1842
Thomas Wouters7e474022000-07-16 12:04:32 +00001843/* Apply a argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001844 the resulting Unicode string. */
1845
Mark Hammond91a681d2002-08-12 07:21:58 +00001846PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001847 PyObject *format, /* Format string */
1848 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001849 );
1850
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001851/* Checks whether element is contained in container and return 1/0
1852 accordingly.
1853
1854 element has to coerce to an one element Unicode string. -1 is
1855 returned in case of an error. */
1856
Mark Hammond91a681d2002-08-12 07:21:58 +00001857PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001858 PyObject *container, /* Container string */
1859 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001860 );
1861
Martin v. Löwis47383402007-08-15 07:32:56 +00001862/* Checks whether argument is a valid identifier. */
1863
1864PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1865
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001866#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001867/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001868PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001869 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001870 int striptype,
1871 PyObject *sepobj
1872 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001873#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001874
Eric Smith5807c412008-05-11 21:00:57 +00001875/* Using the current locale, insert the thousands grouping
1876 into the string pointed to by buffer. For the argument descriptions,
1877 see Objects/stringlib/localeutil.h */
1878
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001879#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001880PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1881 Py_ssize_t n_buffer,
1882 Py_UNICODE *digits,
1883 Py_ssize_t n_digits,
1884 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001885#endif
Eric Smith5807c412008-05-11 21:00:57 +00001886
Eric Smitha3b1ac82009-04-03 14:45:06 +00001887/* Using explicit passed-in values, insert the thousands grouping
1888 into the string pointed to by buffer. For the argument descriptions,
1889 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001890#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001892 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 int kind,
1894 void *buffer,
1895 Py_ssize_t n_buffer,
1896 void *digits,
1897 Py_ssize_t n_digits,
1898 Py_ssize_t min_width,
1899 const char *grouping,
1900 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001901#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001902/* === Characters Type APIs =============================================== */
1903
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001904/* Helper array used by Py_UNICODE_ISSPACE(). */
1905
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001906#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001907PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1908
Guido van Rossumd8225182000-03-10 22:33:05 +00001909/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001910 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001911
1912 These APIs are implemented in Objects/unicodectype.c.
1913
1914*/
1915
Mark Hammond91a681d2002-08-12 07:21:58 +00001916PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001917 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001918 );
1919
Mark Hammond91a681d2002-08-12 07:21:58 +00001920PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001921 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001922 );
1923
Mark Hammond91a681d2002-08-12 07:21:58 +00001924PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001925 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001926 );
1927
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001928PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001929 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001930 );
1931
1932PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001933 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001934 );
1935
Mark Hammond91a681d2002-08-12 07:21:58 +00001936PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001937 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001938 );
1939
Mark Hammond91a681d2002-08-12 07:21:58 +00001940PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001941 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001942 );
1943
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001944PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1945 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001946 );
1947
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001948PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1949 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001950 );
1951
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001952PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1953 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001954 );
1955
Mark Hammond91a681d2002-08-12 07:21:58 +00001956PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001957 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001958 );
1959
Mark Hammond91a681d2002-08-12 07:21:58 +00001960PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001961 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001962 );
1963
Mark Hammond91a681d2002-08-12 07:21:58 +00001964PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001965 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001966 );
1967
Mark Hammond91a681d2002-08-12 07:21:58 +00001968PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001969 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001970 );
1971
Mark Hammond91a681d2002-08-12 07:21:58 +00001972PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001973 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001974 );
1975
Mark Hammond91a681d2002-08-12 07:21:58 +00001976PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001977 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001978 );
1979
Georg Brandl559e5d72008-06-11 18:37:52 +00001980PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001981 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001982 );
1983
Mark Hammond91a681d2002-08-12 07:21:58 +00001984PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001985 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001986 );
1987
Victor Stinneref8d95c2010-08-16 22:03:11 +00001988PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1989 const Py_UNICODE *u
1990 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001991
1992PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001993 Py_UNICODE *s1,
1994 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001995
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001996PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1997 Py_UNICODE *s1, const Py_UNICODE *s2);
1998
Martin v. Löwis5b222132007-06-10 09:51:05 +00001999PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002000 Py_UNICODE *s1,
2001 const Py_UNICODE *s2,
2002 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002003
2004PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002005 const Py_UNICODE *s1,
2006 const Py_UNICODE *s2
2007 );
2008
2009PyAPI_FUNC(int) Py_UNICODE_strncmp(
2010 const Py_UNICODE *s1,
2011 const Py_UNICODE *s2,
2012 size_t n
2013 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002014
2015PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002016 const Py_UNICODE *s,
2017 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002018 );
2019
Victor Stinner331ea922010-08-10 16:37:20 +00002020PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002021 const Py_UNICODE *s,
2022 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002023 );
2024
Victor Stinner71133ff2010-09-01 23:43:53 +00002025/* Create a copy of a unicode string ending with a nul character. Return NULL
2026 and raise a MemoryError exception on memory allocation failure, otherwise
2027 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2028
Victor Stinner46408602010-09-03 16:18:00 +00002029PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002030 PyObject *unicode
2031 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002032#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002033
/* Only declared in debug builds (Py_DEBUG) outside the limited API. */
#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
    PyObject *op,
    int check_content);
#endif

Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002040/********************* String Literals ****************************************/
2041/* This structure helps managing static strings. The basic usage goes like this:
2042 Instead of doing
2043
2044 r = PyObject_CallMethod(o, "foo", "args", ...);
2045
2046 do
2047
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002048 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002049 ...
2050 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2051
2052 PyId_foo is a static variable, either on block level or file level. On first
2053 usage, the string "foo" is interned, and the structures are linked. On interpreter
2054 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2055
2056 Alternatively, _Py_static_string allows to choose the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002057 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002058 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2059*/
2060typedef struct _Py_Identifier {
2061 struct _Py_Identifier *next;
2062 const char* string;
2063 PyObject *object;
2064} _Py_Identifier;
2065
/* Define a static _Py_Identifier called varname whose string is value;
   the next and object members are initialized to 0. */
#define _Py_static_string(varname, value) \
    static _Py_Identifier varname = { 0, value, 0 }
/* Define a static identifier PyId_<varname> whose string is "<varname>". */
#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)

2069/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2070PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2071/* Clear all static strings. */
2072PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2073
Guido van Rossumd8225182000-03-10 22:33:05 +00002074#ifdef __cplusplus
2075}
2076#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002077#endif /* !Py_UNICODEOBJECT_H */