blob: 3109cf466748ad1e7ccc217766ab6e09992c3189 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
/* Py_UNICODE and PY_UNICODE_TYPE both name wchar_t; neither is visible when
   Py_LIMITED_API is defined. */
#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
/* Unsigned code-unit types used for the 2-byte and 1-byte string kinds. */
typedef unsigned short Py_UCS2;
typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test: values below 128 are answered from the
   _Py_ascii_whitespace table, everything else is delegated to
   _PyUnicode_IsWhitespace().  ch is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if any of the alpha / decimal / digit / numeric predicates holds.
   ch may be evaluated up to four times, so it must be free of side
   effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE code units from source to target using
   Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` Py_UNICODE slots of target.
   Each argument is evaluated exactly once (target, then value, then
   length), so expressions with side effects are safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* macros to work with surrogates.
   ch is parenthesized so that expression arguments such as `a | b` are
   compared as a whole; it is evaluated up to twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before memcmp() is run
   over the whole substring.  Both arguments are dereferenced directly
   through ->wstr (and substring through ->wstr_length).
   NOTE(review): confirm the pointer type passed in exposes those members
   directly. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == \
       *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, \
             (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200209 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
220 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100241 * length = 0 (use wstr_length)
242 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200243 * kind = PyUnicode_WCHAR_KIND
244 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200245 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200246 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100247 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200248 * wstr is not NULL
249 * data.any is NULL
250 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200251 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200252
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200253 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200254
255 * structure = PyUnicodeObject structure
256 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
257 PyUnicode_4BYTE_KIND
258 * compact = 0
259 * ready = 1
260 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200261 * utf8 is shared and utf8_length = length with data.any if ascii = 1
262 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100263 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200264 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
265 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
266 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200267
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200268 Compact strings use only one memory block (structure + characters),
269 whereas legacy strings use one block for the structure and one block
270 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200271
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200272 Legacy strings are created by PyUnicode_FromUnicode() and
273 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
274 when PyUnicode_READY() is called.
275
276 See also _PyUnicode_CheckConsistency().
277 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000278 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000280 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 struct {
282 /*
283 SSTATE_NOT_INTERNED (0)
284 SSTATE_INTERNED_MORTAL (1)
285 SSTATE_INTERNED_IMMORTAL (2)
286
287 If interned != SSTATE_NOT_INTERNED, the two references from the
288 dictionary to this object are *not* counted in ob_refcnt.
289 */
290 unsigned int interned:2;
291 /* Character size:
292
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200293 - PyUnicode_WCHAR_KIND (0):
294
295 * character type = wchar_t (16 or 32 bits, depending on the
296 platform)
297
298 - PyUnicode_1BYTE_KIND (1):
299
300 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200301 * if ascii is set, all characters must be in range
302 U+0000-U+007F, otherwise at least one character must be in range
303 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200304
305 - PyUnicode_2BYTE_KIND (2):
306
307 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200308 * at least one character must be in range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200309
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200310 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200311
312 * character type = Py_UCS4 (32 bits, unsigned)
313 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200315 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Compact is with respect to the allocation scheme. Compact unicode
317 objects only require one memory block while non-compact objects use
318 one block for the PyUnicodeObject struct and another for its data
319 buffer. */
320 unsigned int compact:1;
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200321 /* The string only contains characters in range U+0000-U+007F (ASCII)
322 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
323 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 unsigned int ascii:1;
325 /* The ready flag indicates whether the object layout is initialized
326 completely. This means that this is either a compact object, or
327 the data pointer is filled out. The bit is redundant, and helps
328 to minimize the test in PyUnicode_IS_READY(). */
329 unsigned int ready:1;
330 } state;
331 wchar_t *wstr; /* wchar_t representation (null-terminated) */
332} PyASCIIObject;
333
334/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200335 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200336 immediately follow the structure. */
337typedef struct {
338 PyASCIIObject _base;
339 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
340 * terminating \0. */
341 char *utf8; /* UTF-8 representation (null-terminated) */
342 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
343 * surrogates count as two code points. */
344} PyCompactUnicodeObject;
345
346/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
347 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200348 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200349typedef struct {
350 PyCompactUnicodeObject _base;
351 union {
352 void *any;
353 Py_UCS1 *latin1;
354 Py_UCS2 *ucs2;
355 Py_UCS4 *ucs4;
356 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000357} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000358#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000359
Mark Hammond91a681d2002-08-12 07:21:58 +0000360PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000361PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000362
Thomas Wouters27d517b2007-02-25 20:39:11 +0000363#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000364 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
365#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000366
367/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000368#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369
/* Length of the wstr representation: PyASCIIObject.length for compact ASCII
   strings, PyCompactUnicodeObject.wstr_length otherwise.  No type checks are
   performed.  op is parenthesized before each cast so that expression
   arguments (e.g. `base + 1`) are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
374
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points.

   The return value of PyUnicode_AsUnicode() is discarded here, so this
   macro does not report a failure of that call.
   NOTE(review): confirm callers tolerate that. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       PyUnicode_WSTR_LENGTH(op)))

/* PyUnicode_GET_SIZE() scaled by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
389
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   When the wstr member is already set it is returned directly and
   PyUnicode_AsUnicode() is not called. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* PyUnicode_AS_UNICODE() viewed as a const char pointer. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
402
403
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200404/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405
406/* Values for PyUnicodeObject.state: */
407
/* Interning state, stored in the 2-bit state.interned field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
412
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  op is parenthesized before the cast so that
   expression arguments are cast as a whole. */
#define PyUnicode_IS_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op is evaluated up to twice. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200428
/* Values stored in the state.kind field.  The three *BYTE* kinds are
   numerically equal to 1, 2 and 4. */
enum PyUnicode_Kind {
/* The string only has a wstr (wchar_t) representation.  This is only
   possible when the string was created with a legacy API and
   _PyUnicode_Ready() has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200439
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly.  */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
448
/* Return one of the PyUnicode_*_KIND values defined above.
   When assertions are enabled, op must pass PyUnicode_Check() and be
   ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
454
/* Return a void pointer to the raw unicode buffer. */

/* Compact strings: the buffer starts right after the object header, which
   is a PyASCIIObject when state.ascii is set and a PyCompactUnicodeObject
   otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact strings: the buffer is the separate data.any block, which is
   asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Select one of the two helpers above based on state.compact. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
469
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   value is cast to the element type selected by kind before it is stored;
   any kind other than 1BYTE or 2BYTE is treated as 4BYTE (asserted). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
496
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  data is indexed with the element type
   selected by kind (any kind other than 1BYTE or 2BYTE is read as Py_UCS4)
   and the result is widened to Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
508
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   The unicode argument is expanded several times, so it must be free of
   side effects. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_READY() to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
532
533
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   No type checks are performed.  op is parenthesized before the cast so
   that expression arguments are cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
538
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors.
   op is expanded more than once, so it must be free of side effects. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.
   The result is 0x7f for ASCII strings, otherwise 0xff, 0xffff or 0x10ffff
   depending on whether the kind is 1BYTE, 2BYTE or anything else. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7f) :                                                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000561#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000562
563/* --- Constants ---------------------------------------------------------- */
564
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000571
572/* === Public API ========================================================= */
573
574/* --- Plain Py_UNICODE --------------------------------------------------- */
575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200576/* With PEP 393, this is the recommended way to allocate a new unicode object.
577 This function will allocate the object and its buffer in a single memory
578 block. Objects created using this function are not resizable. */
579#ifndef Py_LIMITED_API
580PyAPI_FUNC(PyObject*) PyUnicode_New(
581 Py_ssize_t size, /* Number of code points in the new string */
582 Py_UCS4 maxchar /* maximum code point value in the string */
583 );
584#endif
585
Victor Stinnerd8f65102011-09-29 19:43:17 +0200586/* Initializes the canonical string representation from a the deprecated
587 wstr/Py_UNICODE representation. This function is used to convert Unicode
588 objects which were created using the old API to the new flexible format
589 introduced with PEP 393.
590
591 Don't call this function directly, use the public PyUnicode_READY() macro
592 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593#ifndef Py_LIMITED_API
594PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200595 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200596 );
597#endif
598
Victor Stinner034f6cf2011-09-30 02:26:44 +0200599/* Get a copy of a Unicode string. */
600PyAPI_FUNC(PyObject*) PyUnicode_Copy(
601 PyObject *unicode
602 );
603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200605 character conversion when necessary and falls back to memcpy if possible.
606
Victor Stinnera0702ab2011-09-29 14:14:38 +0200607 Fail if to is too small (smaller than how_many or smaller than
608 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
609 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200610
611 Return the number of characters written, or return -1 and raise an exception
612 on error.
613
614 Pseudo-code:
615
616 how_many = min(how_many, len(from) - from_start)
617 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
618 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200619
620 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200623PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624 PyObject *to,
625 Py_ssize_t to_start,
626 PyObject *from,
627 Py_ssize_t from_start,
628 Py_ssize_t how_many
629 );
630#endif
631
Guido van Rossumd8225182000-03-10 22:33:05 +0000632/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000634
635 u may be NULL which causes the contents to be undefined. It is the
636 user's responsibility to fill in the needed data afterwards. Note
637 that modifying the Unicode object contents after construction is
638 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000639
640 The buffer is copied into the new object. */
641
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000642#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000643PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000644 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000645 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000646 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000647#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000648
Georg Brandl952867a2010-06-27 10:17:12 +0000649/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000650PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000651 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000652 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000653 );
654
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000655/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200656 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000657PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000658 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000659 );
660
Victor Stinnerb9275c12011-10-05 14:01:42 +0200661/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
662 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifndef Py_LIMITED_API
664PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
665 int kind,
666 const void *buffer,
667 Py_ssize_t size);
668#endif
669
/* Takes a Unicode object and a pair of indices and returns an object.
   NOTE(review): undocumented here; judging by the name and parameters this
   presumably returns the slice str[start:end] (end exclusive) as a new
   reference, or NULL with an exception set -- confirm against the
   implementation. */
670PyAPI_FUNC(PyObject*) PyUnicode_Substring(
671 PyObject *str,
672 Py_ssize_t start,
673 Py_ssize_t end);
674
Georg Brandldb6c7f52011-10-07 11:19:11 +0200675/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676 is set. Return NULL and raise an exception on error. Raise a ValueError if
677 the buffer is smaller than the string. Return buffer on success.
678
679 buflen is the length of the buffer in (Py_UCS4) characters. */
680PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
681 PyObject *unicode,
682 Py_UCS4* buffer,
683 Py_ssize_t buflen,
684 int copy_null);
685
686/* Copy the string into a UCS4 buffer. A new buffer is allocated using
687 PyMem_Malloc; if this fails, NULL is returned with a memory error
688 exception set. */
689PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
690
Guido van Rossumd8225182000-03-10 22:33:05 +0000691/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 Py_UNICODE buffer.
693 If the wchar_t/Py_UNICODE representation is not yet available, this
694 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000695
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000696#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000697PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000699 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000700#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200702/* Return a read-only pointer to the Unicode object's internal
703 Py_UNICODE buffer and save the length at size.
704 If the wchar_t/Py_UNICODE representation is not yet available, this
705 function will calculate it. */
706
707#ifndef Py_LIMITED_API
708PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
709 PyObject *unicode, /* Unicode object */
710 Py_ssize_t *size /* location where to save the length */
711 );
712#endif
713
Guido van Rossumd8225182000-03-10 22:33:05 +0000714/* Get the length of the Unicode object. */
715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
717 PyObject *unicode
718);
719
Victor Stinner157f83f2011-09-28 21:41:31 +0200720/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721 string representation. */
722
Martin v. Löwis18e16552006-02-15 17:27:45 +0000723PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000725 );
726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727/* Read a character from the string. */
728
729PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
730 PyObject *unicode,
731 Py_ssize_t index
732 );
733
734/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200735 PyUnicode_New, must not be shared, and must not have been hashed yet.
736
737 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738
739PyAPI_FUNC(int) PyUnicode_WriteChar(
740 PyObject *unicode,
741 Py_ssize_t index,
742 Py_UCS4 character
743 );
744
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000745#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000746/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000747PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000748#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000749
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200750/* Resize a Unicode object allocated by the legacy API (e.g.
751 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
752 PyUnicode_New) cannot be resized by this function.
753
754 The length is a number of Py_UNICODE characters (and not the number of code
755 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000756
757 *unicode is modified to point to the new (resized) object and 0
758 returned on success.
759
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200760 If the refcount on the object is 1, the function resizes the string in
761 place, which is usually faster than allocating a new string (and copying
762 the characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000763
764 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200765 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000766
Mark Hammond91a681d2002-08-12 07:21:58 +0000767PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768 PyObject **unicode, /* Pointer to the Unicode object */
769 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000770 );
771
Guido van Rossumd8225182000-03-10 22:33:05 +0000772/* Coerce obj to a Unicode object and return a reference with
773 *incremented* refcount.
774
775 Coercion is done in the following way:
776
Georg Brandl952867a2010-06-27 10:17:12 +0000777 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000778 under the assumption that they contain data using the UTF-8
779 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000780
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000781 2. All other objects (including Unicode objects) raise an
782 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000783
784 The API returns NULL in case of an error. The caller is responsible
785 for decref'ing the returned object.
786
787*/
788
Mark Hammond91a681d2002-08-12 07:21:58 +0000789PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000791 const char *encoding, /* encoding; NULL selects UTF-8 */
792 const char *errors /* error handling */
793 );
794
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000795/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000796 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000798 Unicode objects are passed back as-is (subclasses are converted to
799 true Unicode objects); all other objects are delegated to
800 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000801 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000802
803 The API returns NULL in case of an error. The caller is responsible
804 for decref'ing the returned object.
805
806*/
807
Mark Hammond91a681d2002-08-12 07:21:58 +0000808PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000810 );
811
/* Two entry points taking an ASCII-encoded format string: the arguments are
   supplied as a va_list (PyUnicode_FromFormatV) or as a variable argument
   list (PyUnicode_FromFormat).
   NOTE(review): undocumented here; presumably printf-like formatting that
   returns a new Unicode object, or NULL with an exception set -- confirm
   against the C API documentation. */
Victor Stinner1205f272010-09-11 00:54:47 +0000812PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
813 const char *format, /* ASCII-encoded string */
814 va_list vargs
815 );
816PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
817 const char *format, /* ASCII-encoded string */
818 ...
819 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000821#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000822/* Format the object based on the format_spec, as defined in PEP 3101
823 (Advanced String Formatting). */
824PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825 PyObject *format_spec,
826 Py_ssize_t start,
827 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000828#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000829
/* String interning.
   NOTE(review): these declarations are undocumented here; the notes below are
   inferred from names and signatures only and should be confirmed against the
   implementation. */

/* Takes the address of an object pointer, presumably so that the pointer can
   be redirected to the interned instance. */
Walter Dörwald16807132007-05-25 13:52:07 +0000830PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
/* Same calling convention as PyUnicode_InternInPlace(). */
831PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
/* Takes a UTF-8 encoded C string and returns an object, presumably the
   interned Unicode string built from it. */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000832PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
833 const char *u /* UTF-8 encoded string */
834 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000835#ifndef Py_LIMITED_API
/* Private; not declared when Py_LIMITED_API is defined. */
Walter Dörwald16807132007-05-25 13:52:07 +0000836PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000837#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000838
839/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840#define PyUnicode_CHECK_INTERNED(op) \
841 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000842
Guido van Rossumd8225182000-03-10 22:33:05 +0000843/* --- wchar_t support for platforms which support it --------------------- */
844
845#ifdef HAVE_WCHAR_H
846
Georg Brandl952867a2010-06-27 10:17:12 +0000847/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000848 size.
849
850 The buffer is copied into the new object. */
851
Mark Hammond91a681d2002-08-12 07:21:58 +0000852PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000853 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000854 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000855 );
856
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000857/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000858 most size wchar_t characters are copied.
859
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000860 Note that the resulting wchar_t string may or may not be
861 0-terminated. It is the responsibility of the caller to make sure
862 that the wchar_t string is 0-terminated in case this is required by
863 the application.
864
865 Returns the number of wchar_t characters copied (excluding a
866 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 error. */
868
Martin v. Löwis18e16552006-02-15 17:27:45 +0000869PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000870 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000871 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000872 Py_ssize_t size /* size of buffer, in wchar_t characters */
Guido van Rossumd8225182000-03-10 22:33:05 +0000873 );
874
Victor Stinner137c34c2010-09-29 10:25:54 +0000875/* Convert the Unicode object to a wide character string. The output string
876 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200877 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000878
879 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
880 on success. On error, returns NULL and raises a MemoryError; *size is
881 then undefined. */
882
883PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000884 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000885 Py_ssize_t *size /* number of characters of the result */
886 );
887
Victor Stinner9f789e72011-10-01 03:57:28 +0200888#ifndef Py_LIMITED_API
/* Private; not declared when Py_LIMITED_API is defined.
   NOTE(review): undocumented here; presumably returns the characters of s
   converted to the requested kind -- confirm who owns the returned buffer
   and how errors are reported in the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200890#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891
Guido van Rossumd8225182000-03-10 22:33:05 +0000892#endif
893
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000894/* --- Unicode ordinals --------------------------------------------------- */
895
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896/* Create a Unicode Object from the given Unicode code point ordinal.
897
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000898 The ordinal must be in range(0x110000); with PEP 393 this no longer
899 depends on a narrow (UCS2) or wide (UCS4) build. A ValueError is
900 raised in case it is not.
901
902*/
903
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000904PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000905
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000906/* --- Free-list management ----------------------------------------------- */
907
908/* Clear the free list used by the Unicode implementation.
909
910 This can be used to release memory used for objects on the free
911 list back to the Python memory allocator.
912
913*/
914
915PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
916
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000917/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000918
919 Many of these APIs take two arguments encoding and errors. These
920 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000921 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000922
Georg Brandl952867a2010-06-27 10:17:12 +0000923 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000924
925 Error handling is set by errors which may also be set to NULL
926 meaning to use the default handling defined for the codec. Default
927 error handling for all builtin codecs is "strict" (ValueErrors are
928 raised).
929
930 The codecs all use a similar interface. Only deviations from the
931 generic ones are documented.
932
933*/
934
Fred Drakecb093fe2000-05-09 19:51:53 +0000935/* --- Manage the default encoding ---------------------------------------- */
936
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000937/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000938 Unicode object unicode and the size of the encoded representation
939 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000940
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000941 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000942
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200943 This function caches the UTF-8 encoded string in the unicodeobject
944 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945 when the unicodeobject is deallocated.
946
947 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
948 support the previous internal function with the same behaviour.
949
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000950 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000951 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000952
953 *** If you need to access the Unicode object as UTF-8 bytes string,
954 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000955*/
956
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000957#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000959 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000960 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000962#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000963
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000964/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000965 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
968 in the unicodeobject.
969
970 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
971 support the previous internal function with the same behaviour.
972
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000973 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000974 extracted from the returned data.
975
976 *** This API is for interpreter INTERNAL USE ONLY and will likely
977 *** be removed or changed in the future.
978
979 *** If you need to access the Unicode object as UTF-8 bytes string,
980 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000981
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000982*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000983
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000984#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
986#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000987#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000988
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000989/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000990
Mark Hammond91a681d2002-08-12 07:21:58 +0000991PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000992
Guido van Rossumd8225182000-03-10 22:33:05 +0000993/* --- Generic Codecs ----------------------------------------------------- */
994
995/* Create a Unicode object by decoding the encoded string s of the
996 given size. */
997
Mark Hammond91a681d2002-08-12 07:21:58 +0000998PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001001 const char *encoding, /* encoding */
1002 const char *errors /* error handling */
1003 );
1004
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001005/* Decode a Unicode object unicode and return the result as Python
1006 object. */
1007
1008PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 PyObject *unicode, /* Unicode object */
1010 const char *encoding, /* encoding */
1011 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001012 );
1013
1014/* Decode a Unicode object unicode and return the result as Unicode
1015 object. */
1016
1017PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001018 PyObject *unicode, /* Unicode object */
1019 const char *encoding, /* encoding */
1020 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001021 );
1022
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 Python string object. */
1025
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001026#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001027PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001028 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001030 const char *encoding, /* encoding */
1031 const char *errors /* error handling */
1032 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001033#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001034
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001035/* Encodes a Unicode object and returns the result as Python
1036 object. */
1037
1038PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001039 PyObject *unicode, /* Unicode object */
1040 const char *encoding, /* encoding */
1041 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001042 );
1043
Guido van Rossumd8225182000-03-10 22:33:05 +00001044/* Encodes a Unicode object and returns the result as Python string
1045 object. */
1046
Mark Hammond91a681d2002-08-12 07:21:58 +00001047PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048 PyObject *unicode, /* Unicode object */
1049 const char *encoding, /* encoding */
1050 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001051 );
1052
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001053/* Encodes a Unicode object and returns the result as Unicode
1054 object. */
1055
1056PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 PyObject *unicode, /* Unicode object */
1058 const char *encoding, /* encoding */
1059 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001060 );
1061
1062/* Build an encoding map. */
1063
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001064PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1065 PyObject* string /* 256 character map */
1066 );
1067
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068/* --- UTF-7 Codecs ------------------------------------------------------- */
1069
/* Decode length bytes of the UTF-7 encoded buffer string. errors selects the
   error handling and may be NULL to use the codec's default.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 const char *string, /* UTF-7 encoded string */
1072 Py_ssize_t length, /* size of string */
1073 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 );
1075
/* Same arguments as PyUnicode_DecodeUTF7(), plus consumed, which receives the
   number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed makes the decoder stop before
   an incomplete trailing sequence instead of treating it as an error --
   confirm. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001076PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077 const char *string, /* UTF-7 encoded string */
1078 Py_ssize_t length, /* size of string */
1079 const char *errors, /* error handling */
1080 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001081 );
1082
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001083#ifndef Py_LIMITED_API
/* Encode length Py_UNICODE characters from data as UTF-7. The two base64*
   flags control whether RFC 2152 Set O characters and whitespace are encoded
   in base64; errors selects the error handling.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001084PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 const Py_UNICODE *data, /* Unicode char buffer */
1086 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1087 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1088 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1089 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090 );
/* Private variant of PyUnicode_EncodeUTF7() that takes a Unicode object
   instead of a Py_UNICODE buffer and length. */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001091PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1092 PyObject *unicode, /* Unicode object */
1093 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1094 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1095 const char *errors /* error handling */
1096 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001097#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001098
Guido van Rossumd8225182000-03-10 22:33:05 +00001099/* --- UTF-8 Codecs ------------------------------------------------------- */
1100
/* Decode length bytes of the UTF-8 encoded buffer string. errors selects the
   error handling and may be NULL to use the codec's default.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on failure -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001101PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001102 const char *string, /* UTF-8 encoded string */
1103 Py_ssize_t length, /* size of string */
1104 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001105 );
1106
/* Same arguments as PyUnicode_DecodeUTF8(), plus consumed, which receives the
   number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed makes the decoder stop before
   an incomplete trailing sequence instead of treating it as an error --
   confirm. */
Walter Dörwald69652032004-09-07 20:24:22 +00001107PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 const char *string, /* UTF-8 encoded string */
1109 Py_ssize_t length, /* size of string */
1110 const char *errors, /* error handling */
1111 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001112 );
1113
/* Takes a Unicode object and returns an object.
   NOTE(review): presumably the UTF-8 encoding of unicode as a new bytes
   object, or NULL with an exception set -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001114PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001116 );
1117
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001118#ifndef Py_LIMITED_API
/* Private variant of PyUnicode_AsUTF8String() with an additional errors
   argument selecting the error handling. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1120 PyObject *unicode,
1121 const char *errors);
1122
/* Encode length Py_UNICODE characters from data as UTF-8; errors selects the
   error handling.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001123PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001124 const Py_UNICODE *data, /* Unicode char buffer */
1125 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1126 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001127 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001128#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001129
Walter Dörwald41980ca2007-08-16 21:55:45 +00001130/* --- UTF-32 Codecs ------------------------------------------------------ */
1131
1132/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1133 the corresponding Unicode object.
1134
1135 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001137
1138 If byteorder is non-NULL, the decoder starts decoding using the
1139 given byte order:
1140
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 *byteorder == -1: little endian
1142 *byteorder == 0: native order
1143 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001144
1145 In native mode, the first four bytes of the stream are checked for a
1146 BOM mark. If found, the BOM mark is analysed, the byte order
1147 adjusted and the BOM skipped. In the other modes, no BOM mark
1148 interpretation is done. After completion, *byteorder is set to the
1149 current byte order at the end of input data.
1150
1151 If byteorder is NULL, the codec starts in native order mode.
1152
1153*/
1154
1155PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001156 const char *string, /* UTF-32 encoded string */
1157 Py_ssize_t length, /* size of string */
1158 const char *errors, /* error handling */
1159 int *byteorder /* pointer to byteorder to use
1160 0=native;-1=LE,1=BE; updated on
1161 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001162 );
1163
1164PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 const char *string, /* UTF-32 encoded string */
1166 Py_ssize_t length, /* size of string */
1167 const char *errors, /* error handling */
1168 int *byteorder, /* pointer to byteorder to use
1169 0=native;-1=LE,1=BE; updated on
1170 exit */
1171 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001172 );
1173
1174/* Returns a Python string using the UTF-32 encoding in native byte
1175 order. The string always starts with a BOM mark. */
1176
1177PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001178 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001179 );
1180
1181/* Returns a Python string object holding the UTF-32 encoded value of
1182 the Unicode data.
1183
1184 If byteorder is not 0, output is written according to the following
1185 byte order:
1186
1187 byteorder == -1: little endian
1188 byteorder == 0: native byte order (writes a BOM mark)
1189 byteorder == 1: big endian
1190
1191 If byteorder is 0, the output string will always start with the
1192 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1193 prepended.
1194
1195*/
1196
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001197#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001198PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001199 const Py_UNICODE *data, /* Unicode char buffer */
1200 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1201 const char *errors, /* error handling */
1202 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001203 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001204PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1205 PyObject *object, /* Unicode object */
1206 const char *errors, /* error handling */
1207 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1208 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001209#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001210
Guido van Rossumd8225182000-03-10 22:33:05 +00001211/* --- UTF-16 Codecs ------------------------------------------------------ */
1212
Guido van Rossum9e896b32000-04-05 20:11:21 +00001213/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001214 the corresponding Unicode object.
1215
1216 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001218
1219 If byteorder is non-NULL, the decoder starts decoding using the
1220 given byte order:
1221
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 *byteorder == -1: little endian
1223 *byteorder == 0: native order
1224 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001225
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001226 In native mode, the first two bytes of the stream are checked for a
1227 BOM mark. If found, the BOM mark is analysed, the byte order
1228 adjusted and the BOM skipped. In the other modes, no BOM mark
1229 interpretation is done. After completion, *byteorder is set to the
1230 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001231
1232 If byteorder is NULL, the codec starts in native order mode.
1233
1234*/
1235
Mark Hammond91a681d2002-08-12 07:21:58 +00001236PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001237 const char *string, /* UTF-16 encoded string */
1238 Py_ssize_t length, /* size of string */
1239 const char *errors, /* error handling */
1240 int *byteorder /* pointer to byteorder to use
1241 0=native;-1=LE,1=BE; updated on
1242 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001243 );
1244
Walter Dörwald69652032004-09-07 20:24:22 +00001245PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001246 const char *string, /* UTF-16 encoded string */
1247 Py_ssize_t length, /* size of string */
1248 const char *errors, /* error handling */
1249 int *byteorder, /* pointer to byteorder to use
1250 0=native;-1=LE,1=BE; updated on
1251 exit */
1252 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001253 );
1254
Guido van Rossumd8225182000-03-10 22:33:05 +00001255/* Returns a Python string using the UTF-16 encoding in native byte
1256 order. The string always starts with a BOM mark. */
1257
Mark Hammond91a681d2002-08-12 07:21:58 +00001258PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001259 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001260 );
1261
1262/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001263 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001264
1265 If byteorder is not 0, output is written according to the following
1266 byte order:
1267
1268 byteorder == -1: little endian
1269 byteorder == 0: native byte order (writes a BOM mark)
1270 byteorder == 1: big endian
1271
1272 If byteorder is 0, the output string will always start with the
1273 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1274 prepended.
1275
1276 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1277 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001278 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001279
1280*/
1281
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001282#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001283PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 const Py_UNICODE *data, /* Unicode char buffer */
1285 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1286 const char *errors, /* error handling */
1287 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001288 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001289PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1290 PyObject* unicode, /* Unicode object */
1291 const char *errors, /* error handling */
1292 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1293 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001294#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001295
1296/* --- Unicode-Escape Codecs ---------------------------------------------- */
1297
Mark Hammond91a681d2002-08-12 07:21:58 +00001298PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001299 const char *string, /* Unicode-Escape encoded string */
1300 Py_ssize_t length, /* size of string */
1301 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 );
1303
Mark Hammond91a681d2002-08-12 07:21:58 +00001304PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001306 );
1307
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001308#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001309PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 const Py_UNICODE *data, /* Unicode char buffer */
1311 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001312 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001313#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001314
1315/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1316
Mark Hammond91a681d2002-08-12 07:21:58 +00001317PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 const char *string, /* Raw-Unicode-Escape encoded string */
1319 Py_ssize_t length, /* size of string */
1320 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001321 );
1322
Mark Hammond91a681d2002-08-12 07:21:58 +00001323PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001324 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001325 );
1326
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001327#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001328PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 const Py_UNICODE *data, /* Unicode char buffer */
1330 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001331 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001332#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001333
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001334/* --- Unicode Internal Codec ---------------------------------------------
1335
1336 Only for internal use in _codecsmodule.c */
1337
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001338#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001339PyObject *_PyUnicode_DecodeUnicodeInternal(
1340 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001341 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001342 const char *errors
1343 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001344#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001345
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001347
1348 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1349
1350*/
1351
Mark Hammond91a681d2002-08-12 07:21:58 +00001352PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001353 const char *string, /* Latin-1 encoded string */
1354 Py_ssize_t length, /* size of string */
1355 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001356 );
1357
Mark Hammond91a681d2002-08-12 07:21:58 +00001358PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001360 );
1361
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001362#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1364 PyObject* unicode,
1365 const char* errors);
1366
Mark Hammond91a681d2002-08-12 07:21:58 +00001367PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368 const Py_UNICODE *data, /* Unicode char buffer */
1369 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1370 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001371 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001372#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001373
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001375
1376 Only 7-bit ASCII data is accepted. All other codes generate errors.
1377
1378*/
1379
Mark Hammond91a681d2002-08-12 07:21:58 +00001380PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 const char *string, /* ASCII encoded string */
1382 Py_ssize_t length, /* size of string */
1383 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001384 );
1385
Mark Hammond91a681d2002-08-12 07:21:58 +00001386PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001388 );
1389
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001390#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1392 PyObject* unicode,
1393 const char* errors);
1394
Mark Hammond91a681d2002-08-12 07:21:58 +00001395PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 const Py_UNICODE *data, /* Unicode char buffer */
1397 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1398 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001399 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001400#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001401
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001403
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001405
1406 Decoding mappings must map single string characters to single
1407 Unicode characters, integers (which are then interpreted as Unicode
1408 ordinals) or None (meaning "undefined mapping" and causing an
1409 error).
1410
1411 Encoding mappings must map single Unicode characters to single
1412 string characters, integers (which are then interpreted as Latin-1
1413 ordinals) or None (meaning "undefined mapping" and causing an
1414 error).
1415
1416 If a character lookup fails with a LookupError, the character is
1417 copied as-is meaning that its ordinal value will be interpreted as
1418 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1419 to contain those mappings which map characters to different code
1420 points.
1421
1422*/
1423
Mark Hammond91a681d2002-08-12 07:21:58 +00001424PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 const char *string, /* Encoded string */
1426 Py_ssize_t length, /* size of string */
1427 PyObject *mapping, /* character mapping
1428 (char ordinal -> unicode ordinal) */
1429 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001430 );
1431
Mark Hammond91a681d2002-08-12 07:21:58 +00001432PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 PyObject *unicode, /* Unicode object */
1434 PyObject *mapping /* character mapping
1435 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001436 );
1437
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001438#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001439PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001440 const Py_UNICODE *data, /* Unicode char buffer */
1441 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1442 PyObject *mapping, /* character mapping
1443 (unicode ordinal -> char ordinal) */
1444 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001445 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001446PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1447 PyObject *unicode, /* Unicode object */
1448 PyObject *mapping, /* character mapping
1449 (unicode ordinal -> char ordinal) */
1450 const char *errors /* error handling */
1451 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001452#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001453
1454/* Translate a Py_UNICODE buffer of the given length by applying a
1455 character mapping table to it and return the resulting Unicode
1456 object.
1457
1458 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001459 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001460
1461 Mapping tables may be dictionaries or sequences. Unmapped character
1462 ordinals (ones which cause a LookupError) are left untouched and
1463 are copied as-is.
1464
1465*/
1466
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001467#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001468PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001469 const Py_UNICODE *data, /* Unicode char buffer */
1470 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1471 PyObject *table, /* Translate table */
1472 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001473 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001474#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001475
Victor Stinner99b95382011-07-04 14:23:54 +02001476#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001477
Guido van Rossumefec1152000-03-28 02:01:15 +00001478/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001479
Mark Hammond91a681d2002-08-12 07:21:58 +00001480PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001481 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001482 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001483 const char *errors /* error handling */
1484 );
1485
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001486PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1487 const char *string, /* MBCS encoded string */
1488 Py_ssize_t length, /* size of string */
1489 const char *errors, /* error handling */
1490 Py_ssize_t *consumed /* bytes consumed */
1491 );
1492
Victor Stinner3a50e702011-10-18 21:21:00 +02001493PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1494 int code_page, /* code page number */
1495 const char *string, /* encoded string */
1496 Py_ssize_t length, /* size of string */
1497 const char *errors, /* error handling */
1498 Py_ssize_t *consumed /* bytes consumed */
1499 );
1500
Mark Hammond91a681d2002-08-12 07:21:58 +00001501PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001502 PyObject *unicode /* Unicode object */
1503 );
1504
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001505#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001506PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001507 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001508 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001509 const char *errors /* error handling */
1510 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001511#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001512
Victor Stinner3a50e702011-10-18 21:21:00 +02001513PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1514 int code_page, /* code page number */
1515 PyObject *unicode, /* Unicode object */
1516 const char *errors /* error handling */
1517 );
1518
Victor Stinner99b95382011-07-04 14:23:54 +02001519#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001520
Guido van Rossum9e896b32000-04-05 20:11:21 +00001521/* --- Decimal Encoder ---------------------------------------------------- */
1522
1523/* Takes a Unicode string holding a decimal value and writes it into
1524 an output buffer using standard ASCII digit codes.
1525
1526 The output buffer has to provide at least length+1 bytes of storage
1527 area. The output string is 0-terminated.
1528
1529 The encoder converts whitespace to ' ', decimal characters to their
1530 corresponding ASCII digit and all other Latin-1 characters except
1531 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1532 are treated as errors. This includes embedded NULL bytes.
1533
1534 Error handling is defined by the errors argument:
1535
1536 NULL or "strict": raise a ValueError
1537 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001538 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001539 "replace": replaces illegal characters with '?'
1540
1541 Returns 0 on success, -1 on failure.
1542
1543*/
1544
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001545#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001546PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001547 Py_UNICODE *s, /* Unicode buffer */
1548 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1549 char *output, /* Output buffer; must have size >= length+1 */
1550 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001551 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001552#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001553
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001554/* Transforms code points that have decimal digit property to the
1555 corresponding ASCII digit code points.
1556
1557 Returns a new Unicode string on success, NULL on failure.
1558*/
1559
Georg Brandlb5503082010-12-05 11:40:48 +00001560#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001561PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1562 Py_UNICODE *s, /* Unicode buffer */
1563 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1564 );
Georg Brandlb5503082010-12-05 11:40:48 +00001565#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1568 as argument instead of a raw buffer and length. This function additionally
1569 transforms spaces to ASCII because this is what the callers in longobject,
1570 floatobject, and complexobject did anyways. */
1571
1572#ifndef Py_LIMITED_API
1573PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1574 PyObject *unicode /* Unicode object */
1575 );
1576#endif
1577
Martin v. Löwis011e8422009-05-05 04:43:17 +00001578/* --- File system encoding ---------------------------------------------- */
1579
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001580/* ParseTuple converter: encode str objects to bytes using
1581 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001582
1583PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1584
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001585/* ParseTuple converter: decode bytes objects to unicode using
1586 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1587
1588PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1589
Victor Stinner77c38622010-05-14 15:58:55 +00001590/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1591 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001592
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001593 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1594 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001595
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001596 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001597*/
1598
1599PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1600 const char *s /* encoded string */
1601 );
1602
Victor Stinner77c38622010-05-14 15:58:55 +00001603/* Decode a string using Py_FileSystemDefaultEncoding
1604 and the "surrogateescape" error handler.
1605
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001606 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1607 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001608*/
1609
Martin v. Löwis011e8422009-05-05 04:43:17 +00001610PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1611 const char *s, /* encoded string */
1612 Py_ssize_t size /* size */
1613 );
1614
Victor Stinnerae6265f2010-05-15 16:27:27 +00001615/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001616 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001617
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001618 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1619 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001620*/
1621
1622PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1623 PyObject *unicode
1624 );
1625
Guido van Rossumd8225182000-03-10 22:33:05 +00001626/* --- Methods & Slots ----------------------------------------------------
1627
1628 These are capable of handling Unicode objects and strings on input
1629 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001630 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001631
1632/* Concat two strings giving a new Unicode string. */
1633
Mark Hammond91a681d2002-08-12 07:21:58 +00001634PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 PyObject *left, /* Left string */
1636 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001637 );
1638
Walter Dörwald1ab83302007-05-18 17:15:44 +00001639/* Concat two strings and put the result in *pleft
1640 (sets *pleft to NULL on error) */
1641
1642PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 PyObject **pleft, /* Pointer to left string */
1644 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001645 );
1646
1647/* Concat two strings, put the result in *pleft and drop the right object
1648 (sets *pleft to NULL on error) */
1649
1650PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 PyObject **pleft, /* Pointer to left string */
1652 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001653 );
1654
Guido van Rossumd8225182000-03-10 22:33:05 +00001655/* Split a string giving a list of Unicode strings.
1656
1657 If sep is NULL, splitting will be done at all whitespace
1658 substrings. Otherwise, splits occur at the given separator.
1659
1660 At most maxsplit splits will be done. If negative, no limit is set.
1661
1662 Separators are not included in the resulting list.
1663
1664*/
1665
Mark Hammond91a681d2002-08-12 07:21:58 +00001666PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001667 PyObject *s, /* String to split */
1668 PyObject *sep, /* String separator */
1669 Py_ssize_t maxsplit /* Maxsplit count */
1670 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001671
1672/* Ditto, but split at line breaks.
1673
1674 CRLF is considered to be one line break. Line breaks are not
1675 included in the resulting list. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001676
Mark Hammond91a681d2002-08-12 07:21:58 +00001677PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001678 PyObject *s, /* String to split */
1679 int keepends /* If true, line end markers are included */
1680 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001681
Thomas Wouters477c8d52006-05-27 19:21:47 +00001682/* Partition a string using a given separator. */
1683
1684PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 PyObject *s, /* String to partition */
1686 PyObject *sep /* String separator */
1687 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001688
1689/* Partition a string using a given separator, searching from the end of the
1690 string. */
1691
1692PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 PyObject *s, /* String to partition */
1694 PyObject *sep /* String separator */
1695 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001696
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001697/* Split a string giving a list of Unicode strings.
1698
1699 If sep is NULL, splitting will be done at all whitespace
1700 substrings. Otherwise, splits occur at the given separator.
1701
1702 At most maxsplit splits will be done. But unlike PyUnicode_Split
1703 PyUnicode_RSplit splits from the end of the string. If negative,
1704 no limit is set.
1705
1706 Separators are not included in the resulting list.
1707
1708*/
1709
1710PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001711 PyObject *s, /* String to split */
1712 PyObject *sep, /* String separator */
1713 Py_ssize_t maxsplit /* Maxsplit count */
1714 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001715
Guido van Rossumd8225182000-03-10 22:33:05 +00001716/* Translate a string by applying a character mapping table to it and
1717 return the resulting Unicode object.
1718
1719 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001721
1722 Mapping tables may be dictionaries or sequences. Unmapped character
1723 ordinals (ones which cause a LookupError) are left untouched and
1724 are copied as-is.
1725
1726*/
1727
Mark Hammond91a681d2002-08-12 07:21:58 +00001728PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001729 PyObject *str, /* String */
1730 PyObject *table, /* Translate table */
1731 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001732 );
1733
1734/* Join a sequence of strings using the given separator and return
1735 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001736
Mark Hammond91a681d2002-08-12 07:21:58 +00001737PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001738 PyObject *separator, /* Separator string */
1739 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001740 );
1741
1742/* Return 1 if substr matches str[start:end] at the given tail end, 0
1743 otherwise. */
1744
Martin v. Löwis18e16552006-02-15 17:27:45 +00001745PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001746 PyObject *str, /* String */
1747 PyObject *substr, /* Prefix or Suffix string */
1748 Py_ssize_t start, /* Start index */
1749 Py_ssize_t end, /* Stop index */
1750 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001751 );
1752
1753/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001754 given search direction or -1 if not found. -2 is returned in case
1755 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001756
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001758 PyObject *str, /* String */
1759 PyObject *substr, /* Substring to find */
1760 Py_ssize_t start, /* Start index */
1761 Py_ssize_t end, /* Stop index */
1762 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001763 );
1764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765/* Like PyUnicode_Find, but search for single character only. */
1766PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1767 PyObject *str,
1768 Py_UCS4 ch,
1769 Py_ssize_t start,
1770 Py_ssize_t end,
1771 int direction
1772 );
1773
Barry Warsaw51ac5802000-03-20 16:36:48 +00001774/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001775
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001777 PyObject *str, /* String */
1778 PyObject *substr, /* Substring to count */
1779 Py_ssize_t start, /* Start index */
1780 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001781 );
1782
Barry Warsaw51ac5802000-03-20 16:36:48 +00001783/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001784 and return the resulting Unicode object. */
1785
Mark Hammond91a681d2002-08-12 07:21:58 +00001786PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001787 PyObject *str, /* String */
1788 PyObject *substr, /* Substring to find */
1789 PyObject *replstr, /* Substring to replace */
1790 Py_ssize_t maxcount /* Max. number of replacements to apply;
1791 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001792 );
1793
1794/* Compare two strings and return -1, 0, 1 for less than, equal,
1795 greater than resp. */
1796
Mark Hammond91a681d2002-08-12 07:21:58 +00001797PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001798 PyObject *left, /* Left string */
1799 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001800 );
1801
/* Compare the string object left with the C string right.
   NOTE(review): presumably returns -1, 0, 1 for less than, equal, greater
   than, like PyUnicode_Compare - confirm against the implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001802PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1803 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001804 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001805 );
1806
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001807/* Rich compare two strings and return one of the following:
1808
1809 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001810 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001811 - Py_NotImplemented in case the type combination is unknown
1812
1813 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1814 case the conversion of the arguments to Unicode fails with a
1815 UnicodeDecodeError.
1816
1817 Possible values for op:
1818
1819 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1820
1821*/
1822
1823PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001824 PyObject *left, /* Left string */
1825 PyObject *right, /* Right string */
1826 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001827 );
1828
Thomas Wouters7e474022000-07-16 12:04:32 +00001829/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001830 the resulting Unicode string. */
1831
Mark Hammond91a681d2002-08-12 07:21:58 +00001832PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001833 PyObject *format, /* Format string */
1834 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001835 );
1836
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001837/* Check whether element is contained in container and return 1/0
1838 accordingly.
1839
1840 element has to coerce to a one-element Unicode string. -1 is
1841 returned in case of an error. */
1842
Mark Hammond91a681d2002-08-12 07:21:58 +00001843PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001844 PyObject *container, /* Container string */
1845 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001846 );
1847
Martin v. Löwis47383402007-08-15 07:32:56 +00001848/* Checks whether argument is a valid identifier. */
1849
1850PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1851
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001852#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001853/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001854PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001855 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001856 int striptype,
1857 PyObject *sepobj
1858 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001859#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001860
Eric Smith5807c412008-05-11 21:00:57 +00001861/* Using the current locale, insert the thousands grouping
1862 into the string pointed to by buffer. For the argument descriptions,
1863 see Objects/stringlib/localeutil.h */
1864
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001865#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001866PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1867 Py_ssize_t n_buffer,
1868 Py_UNICODE *digits,
1869 Py_ssize_t n_digits,
1870 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001871#endif
Eric Smith5807c412008-05-11 21:00:57 +00001872
Eric Smitha3b1ac82009-04-03 14:45:06 +00001873/* Using explicit passed-in values, insert the thousands grouping
1874 into the string pointed to by buffer. For the argument descriptions,
1875 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001876#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001878 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 int kind,
1880 void *buffer,
1881 Py_ssize_t n_buffer,
1882 void *digits,
1883 Py_ssize_t n_digits,
1884 Py_ssize_t min_width,
1885 const char *grouping,
1886 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001887#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001888/* === Characters Type APIs =============================================== */
1889
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001890/* Helper array used by Py_UNICODE_ISSPACE(). */
1891
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001892#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001893PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1894
Guido van Rossumd8225182000-03-10 22:33:05 +00001895/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001896 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001897
1898 These APIs are implemented in Objects/unicodectype.c.
1899
1900*/
1901
Mark Hammond91a681d2002-08-12 07:21:58 +00001902PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001903 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001904 );
1905
Mark Hammond91a681d2002-08-12 07:21:58 +00001906PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001907 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001908 );
1909
Mark Hammond91a681d2002-08-12 07:21:58 +00001910PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001911 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001912 );
1913
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001914PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001915 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001916 );
1917
1918PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001919 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001920 );
1921
Mark Hammond91a681d2002-08-12 07:21:58 +00001922PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001923 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001924 );
1925
Mark Hammond91a681d2002-08-12 07:21:58 +00001926PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001927 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001928 );
1929
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001930PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1931 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001932 );
1933
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001934PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1935 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001938PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1939 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Mark Hammond91a681d2002-08-12 07:21:58 +00001942PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001943 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001944 );
1945
Mark Hammond91a681d2002-08-12 07:21:58 +00001946PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001947 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001948 );
1949
Mark Hammond91a681d2002-08-12 07:21:58 +00001950PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001951 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001952 );
1953
Mark Hammond91a681d2002-08-12 07:21:58 +00001954PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001955 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001956 );
1957
Mark Hammond91a681d2002-08-12 07:21:58 +00001958PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001959 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001960 );
1961
Mark Hammond91a681d2002-08-12 07:21:58 +00001962PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001963 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001964 );
1965
Georg Brandl559e5d72008-06-11 18:37:52 +00001966PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001967 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001968 );
1969
Mark Hammond91a681d2002-08-12 07:21:58 +00001970PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001971 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001972 );
1973
/* Py_UNICODE string helpers. Their names and signatures parallel the C
   library's strlen, strcpy, strcat, strncpy, strcmp, strncmp, strchr and
   strrchr, taking Py_UNICODE pointers/characters in place of char ones.
   NOTE(review): presumably they expect nul-terminated input and follow the
   same contracts as their <string.h> counterparts - confirm against the
   implementation. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001974PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1975 const Py_UNICODE *u
1976 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977
1978PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001979 Py_UNICODE *s1,
1980 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001981
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001982PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1983 Py_UNICODE *s1, const Py_UNICODE *s2);
1984
Martin v. Löwis5b222132007-06-10 09:51:05 +00001985PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001986 Py_UNICODE *s1,
1987 const Py_UNICODE *s2,
1988 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001989
1990PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001991 const Py_UNICODE *s1,
1992 const Py_UNICODE *s2
1993 );
1994
1995PyAPI_FUNC(int) Py_UNICODE_strncmp(
1996 const Py_UNICODE *s1,
1997 const Py_UNICODE *s2,
1998 size_t n
1999 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002000
2001PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002002 const Py_UNICODE *s,
2003 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002004 );
2005
Victor Stinner331ea922010-08-10 16:37:20 +00002006PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002007 const Py_UNICODE *s,
2008 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002009 );
2010
Victor Stinner71133ff2010-09-01 23:43:53 +00002011/* Create a copy of a unicode string ending with a nul character. Return NULL
2012 and raise a MemoryError exception on memory allocation failure, otherwise
2013 return a new allocated buffer (use PyMem_Free() to free the buffer). */
2014
Victor Stinner46408602010-09-03 16:18:00 +00002015PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002016 PyObject *unicode
2017 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002018#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002019
/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not.
   NOTE(review): judging by the name, verifies the internal consistency of
   the object op; the meaning of check_content and of the int result is not
   visible in this header - confirm against the implementation. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002020#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002021PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002022 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002023 int check_content);
2024#endif
2025
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002026/********************* String Literals ****************************************/
2027/* This structure helps managing static strings. The basic usage goes like this:
2028 Instead of doing
2029
2030 r = PyObject_CallMethod(o, "foo", "args", ...);
2031
2032 do
2033
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002034 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002035 ...
2036 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2037
2038 PyId_foo is a static variable, either on block level or file level. On first
2039 usage, the string "foo" is interned, and the structures are linked. On interpreter
2040 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2041
2042 Alternatively, _Py_static_string allows choosing the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002043 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002044 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2045*/
/* Record behind one static string; instances are defined with
   _Py_static_string or _Py_IDENTIFIER. */
2046typedef struct _Py_Identifier {
2047 struct _Py_Identifier *next; /* Link to another identifier record; 0 in the static initializer */
2048 const char* string; /* C string this identifier stands for */
2049 PyObject *object; /* 0 in the static initializer. NOTE(review): presumably
   the interned Unicode object once created - confirm */
2050} _Py_Identifier;
2051
/* Define a static _Py_Identifier named varname whose string member is value;
   the next and object members start out as 0. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002052#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
/* Define a static _Py_Identifier named PyId_<varname> whose string member is
   the stringified spelling of varname. */
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002053#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002054
2055/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2056PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2057/* Clear all static strings. */
2058PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2059
Guido van Rossumd8225182000-03-10 22:33:05 +00002060#ifdef __cplusplus
2061}
2062#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002063#endif /* !Py_UNICODEOBJECT_H */