blob: 58c1f55455608a88da2019c5ff6d52d860824b2b [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` elements of the Py_UNICODE buffer
   `target`.  Each argument is evaluated exactly once (in the order target,
   value, length), so expressions with side effects are safe to pass.
   Nothing is written when `length` is zero or negative. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);   /* captured once, not per iteration */\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.  The range checks expand `ch` twice, so
   do not pass an expression with side effects.  The argument is fully
   parenthesized, so compound expressions (e.g. `a | b`, `c ? x : y`) are
   classified by their value. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; no check is made that
   `high` and `low` really are a high/low surrogate pair. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200209 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200210
211 - compact ascii:
212
213 * structure = PyASCIIObject
214 * kind = PyUnicode_1BYTE_KIND
215 * compact = 1
216 * ascii = 1
217 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200218 * (length is the length of the utf8 and wstr strings)
219 * (data starts just after the structure)
       * (since ASCII data is already valid UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200221
222 - compact:
223
224 * structure = PyCompactUnicodeObject
225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226 PyUnicode_4BYTE_KIND
227 * compact = 1
228 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200229 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200230 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200231 * utf8_length = 0 if utf8 is NULL
232 * wstr is shared with data and wstr_length=length
233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200235 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200236 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200237
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200238 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200239
240 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100241 * length = 0 (use wstr_length)
242 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200243 * kind = PyUnicode_WCHAR_KIND
244 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200245 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200246 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100247 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200248 * wstr is not NULL
249 * data.any is NULL
250 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200251 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200252
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200253 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200254
255 * structure = PyUnicodeObject structure
256 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
257 PyUnicode_4BYTE_KIND
258 * compact = 0
259 * ready = 1
260 * data.any is not NULL
       * utf8 is shared with data.any and utf8_length = length if ascii = 1
262 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100263 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200264 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
266 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200267
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200268 Compact strings use only one memory block (structure + characters),
269 whereas legacy strings use one block for the structure and one block
270 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200271
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200272 Legacy strings are created by PyUnicode_FromUnicode() and
273 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
274 when PyUnicode_READY() is called.
275
276 See also _PyUnicode_CheckConsistency().
277 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000278 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200279 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000280 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 struct {
282 /*
283 SSTATE_NOT_INTERNED (0)
284 SSTATE_INTERNED_MORTAL (1)
285 SSTATE_INTERNED_IMMORTAL (2)
286
287 If interned != SSTATE_NOT_INTERNED, the two references from the
288 dictionary to this object are *not* counted in ob_refcnt.
289 */
290 unsigned int interned:2;
291 /* Character size:
292
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200293 - PyUnicode_WCHAR_KIND (0):
294
295 * character type = wchar_t (16 or 32 bits, depending on the
296 platform)
297
298 - PyUnicode_1BYTE_KIND (1):
299
300 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200301 * if ascii is set, all characters must be in range
302 U+0000-U+007F, otherwise at least one character must be in range
303 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200304
305 - PyUnicode_2BYTE_KIND (2):
306
307 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200308 * at least one character must be in range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200309
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200310 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200311
312 * character type = Py_UCS4 (32 bits, unsigned)
313 * at least one character must be in range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200315 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Compact is with respect to the allocation scheme. Compact unicode
317 objects only require one memory block while non-compact objects use
318 one block for the PyUnicodeObject struct and another for its data
319 buffer. */
320 unsigned int compact:1;
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200321 /* The string only contains characters in range U+0000-U+007F (ASCII)
322 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
323 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 unsigned int ascii:1;
325 /* The ready flag indicates whether the object layout is initialized
326 completely. This means that this is either a compact object, or
327 the data pointer is filled out. The bit is redundant, and helps
328 to minimize the test in PyUnicode_IS_READY(). */
329 unsigned int ready:1;
330 } state;
331 wchar_t *wstr; /* wchar_t representation (null-terminated) */
332} PyASCIIObject;
333
334/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200335 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200336 immediately follow the structure. */
337typedef struct {
338 PyASCIIObject _base;
339 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
340 * terminating \0. */
341 char *utf8; /* UTF-8 representation (null-terminated) */
342 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
343 * surrogates count as two code points. */
344} PyCompactUnicodeObject;
345
346/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
347 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200348 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200349typedef struct {
350 PyCompactUnicodeObject _base;
351 union {
352 void *any;
353 Py_UCS1 *latin1;
354 Py_UCS2 *ucs2;
355 Py_UCS4 *ucs4;
356 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000357} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000358#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000359
Mark Hammond91a681d2002-08-12 07:21:58 +0000360PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000361PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000362
Thomas Wouters27d517b2007-02-25 20:39:11 +0000363#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000364 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
365#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000366
367/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000368#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369
/* Length of the wstr representation of op.  Compact ASCII strings use the
   PyASCIIObject structure, which has no wstr_length field; for them the
   length field is used.  All other strings read wstr_length from
   PyCompactUnicodeObject.  No type checks or Ready calls are performed. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
374
375/* Returns the deprecated Py_UNICODE representation's size in code units
376 (this includes surrogate pairs as 2 units).
377 If the Py_UNICODE representation is not available, it will be computed
378 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
379
Guido van Rossumd8225182000-03-10 22:33:05 +0000380#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381 (assert(PyUnicode_Check(op)), \
382 (((PyASCIIObject *)(op))->wstr) ? \
383 PyUnicode_WSTR_LENGTH(op) : \
384 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
385 PyUnicode_WSTR_LENGTH(op)))
386
Guido van Rossumd8225182000-03-10 22:33:05 +0000387#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
389
390/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
391 representation on demand. Using this macro is very inefficient now,
392 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
393 use PyUnicode_WRITE() and PyUnicode_READ(). */
394
Guido van Rossumd8225182000-03-10 22:33:05 +0000395#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 (assert(PyUnicode_Check(op)), \
397 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
398 PyUnicode_AsUnicode((PyObject *)(op)))
399
Guido van Rossumd8225182000-03-10 22:33:05 +0000400#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401 ((const char *)(PyUnicode_AS_UNICODE(op)))
402
403
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200404/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405
406/* Values for PyUnicodeObject.state: */
407
408/* Interning state. */
409#define SSTATE_NOT_INTERNED 0
410#define SSTATE_INTERNED_MORTAL 1
411#define SSTATE_INTERNED_IMMORTAL 2
412
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   Note that op is expanded twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200428
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200429enum PyUnicode_Kind {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200430/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200431 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 has not been called yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200433 PyUnicode_WCHAR_KIND = 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200434/* Return values of the PyUnicode_KIND() macro: */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200435 PyUnicode_1BYTE_KIND = 1,
436 PyUnicode_2BYTE_KIND = 2,
437 PyUnicode_4BYTE_KIND = 4
438};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200439
Georg Brandl4975a9b2011-10-05 16:12:21 +0200440/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200442 No checks are performed, use PyUnicode_KIND() before to ensure
443 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200444
445#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
446#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
447#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
448
Victor Stinner157f83f2011-09-28 21:41:31 +0200449/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450#define PyUnicode_KIND(op) \
451 (assert(PyUnicode_Check(op)), \
452 assert(PyUnicode_IS_READY(op)), \
453 ((PyASCIIObject *)(op))->state.kind)
454
Victor Stinner157f83f2011-09-28 21:41:31 +0200455/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200456#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200457 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200458 ((void*)((PyASCIIObject*)(op) + 1)) : \
459 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
460
461#define _PyUnicode_NONCOMPACT_DATA(op) \
462 (assert(((PyUnicodeObject*)(op))->data.any), \
463 ((((PyUnicodeObject *)(op))->data.any)))
464
465#define PyUnicode_DATA(op) \
466 (assert(PyUnicode_Check(op)), \
467 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
468 _PyUnicode_NONCOMPACT_DATA(op))
469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200470/* In the access macros below, "kind" may be evaluated more than once.
471 All other macro parameters are evaluated exactly once, so it is safe
472 to put side effects into them (such as increasing the index). */
473
474/* Write into the canonical representation, this macro does not do any sanity
475 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200476 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200477 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200478 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200479#define PyUnicode_WRITE(kind, data, index, value) \
480 do { \
481 switch ((kind)) { \
482 case PyUnicode_1BYTE_KIND: { \
483 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
484 break; \
485 } \
486 case PyUnicode_2BYTE_KIND: { \
487 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
488 break; \
489 } \
490 default: { \
491 assert((kind) == PyUnicode_4BYTE_KIND); \
492 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
493 } \
494 } \
495 } while (0)
496
Georg Brandl07de3252011-10-05 16:47:38 +0200497/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200498 or ready calls are performed. */
499#define PyUnicode_READ(kind, data, index) \
500 ((Py_UCS4) \
501 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200502 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200503 ((kind) == PyUnicode_2BYTE_KIND ? \
504 ((const Py_UCS2 *)(data))[(index)] : \
505 ((const Py_UCS4 *)(data))[(index)] \
506 ) \
507 ))
508
509/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
510 calls PyUnicode_KIND() and might call it twice. For single reads, use
511 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
512 cache kind and use PyUnicode_READ instead. */
513#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200514 (assert(PyUnicode_Check(unicode)), \
515 assert(PyUnicode_IS_READY(unicode)), \
516 (Py_UCS4) \
517 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
518 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
519 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
520 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
521 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
522 ) \
523 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
528#define PyUnicode_GET_LENGTH(op) \
529 (assert(PyUnicode_Check(op)), \
530 assert(PyUnicode_IS_READY(op)), \
531 ((PyASCIIObject *)(op))->length)
532
533
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Only reads the state.ready bit; no type checks are performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
538
Victor Stinnera3b334d2011-10-03 13:53:37 +0200539/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200540 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200541 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 Returns 0 on success and -1 on errors. */
543#define PyUnicode_READY(op) \
544 (assert(PyUnicode_Check(op)), \
545 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200546 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200548/* Return a maximum character value which is suitable for creating another
549 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200550 than iterating over the string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551#define PyUnicode_MAX_CHAR_VALUE(op) \
552 (assert(PyUnicode_IS_READY(op)), \
Victor Stinner88131042011-10-13 01:12:01 +0200553 (PyUnicode_IS_ASCII(op) ? \
554 (0x7f) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200556 (0xffU) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200558 (0xffffU) : \
559 (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000561#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000562
563/* --- Constants ---------------------------------------------------------- */
564
565/* This Unicode character will be used as replacement character during
566 decoding if the errors argument is set to "replace". Note: the
567 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
568 Unicode 3.0. */
569
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200570#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000571
572/* === Public API ========================================================= */
573
574/* --- Plain Py_UNICODE --------------------------------------------------- */
575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200576/* With PEP 393, this is the recommended way to allocate a new unicode object.
577 This function will allocate the object and its buffer in a single memory
578 block. Objects created using this function are not resizable. */
579#ifndef Py_LIMITED_API
580PyAPI_FUNC(PyObject*) PyUnicode_New(
581 Py_ssize_t size, /* Number of code points in the new string */
582 Py_UCS4 maxchar /* maximum code point value in the string */
583 );
584#endif
585
/* Initializes the canonical string representation from the deprecated
587 wstr/Py_UNICODE representation. This function is used to convert Unicode
588 objects which were created using the old API to the new flexible format
589 introduced with PEP 393.
590
591 Don't call this function directly, use the public PyUnicode_READY() macro
592 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593#ifndef Py_LIMITED_API
594PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200595 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200596 );
597#endif
598
Victor Stinner034f6cf2011-09-30 02:26:44 +0200599/* Get a copy of a Unicode string. */
600PyAPI_FUNC(PyObject*) PyUnicode_Copy(
601 PyObject *unicode
602 );
603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200605 character conversion when necessary and falls back to memcpy if possible.
606
Victor Stinnera0702ab2011-09-29 14:14:38 +0200607 Fail if to is too small (smaller than how_many or smaller than
608 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
609 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200610
611 Return the number of characters written, or return -1 and raise an exception
612 on error.
613
614 Pseudo-code:
615
616 how_many = min(how_many, len(from) - from_start)
617 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
618 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200619
620 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200623PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624 PyObject *to, /* destination string */
625 Py_ssize_t to_start, /* index in to of the first character written */
626 PyObject *from, /* source string */
627 Py_ssize_t from_start, /* index in from of the first character read */
628 Py_ssize_t how_many /* maximum number of characters to copy */
629 );
630#endif
631
Guido van Rossumd8225182000-03-10 22:33:05 +0000632/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000634
635 u may be NULL which causes the contents to be undefined. It is the
636 user's responsibility to fill in the needed data afterwards. Note
637 that modifying the Unicode object contents after construction is
638 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000639
640 The buffer is copied into the new object. */
641
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000642#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000643PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000644 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000645 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000646 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000647#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000648
Georg Brandl952867a2010-06-27 10:17:12 +0000649/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000650PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000651 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000652 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000653 );
654
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000655/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200656 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000657PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000658 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000659 );
660
Victor Stinnerb9275c12011-10-05 14:01:42 +0200661/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
662 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifndef Py_LIMITED_API
664PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
665 int kind,
666 const void *buffer,
667 Py_ssize_t size);
668#endif
669
670PyAPI_FUNC(PyObject*) PyUnicode_Substring(
671 PyObject *str,
672 Py_ssize_t start,
673 Py_ssize_t end);
674
Georg Brandldb6c7f52011-10-07 11:19:11 +0200675/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676 is set. Return NULL and raise an exception on error. Raise a ValueError if
677 the buffer is smaller than the string. Return buffer on success.
678
679 buflen is the length of the buffer in (Py_UCS4) characters. */
680PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
681 PyObject *unicode,
682 Py_UCS4* buffer,
683 Py_ssize_t buflen,
684 int copy_null);
685
686/* Copy the string into a UCS4 buffer. A new buffer is allocated using
687 PyMem_Malloc; if this fails, NULL is returned with a memory error
688 exception set. */
689PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
690
Guido van Rossumd8225182000-03-10 22:33:05 +0000691/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 Py_UNICODE buffer.
693 If the wchar_t/Py_UNICODE representation is not yet available, this
694 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000695
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000696#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000697PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000699 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000700#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200702/* Return a read-only pointer to the Unicode object's internal
703 Py_UNICODE buffer and save the length at size.
704 If the wchar_t/Py_UNICODE representation is not yet available, this
705 function will calculate it. */
706
707#ifndef Py_LIMITED_API
708PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
709 PyObject *unicode, /* Unicode object */
710 Py_ssize_t *size /* location where to save the length */
711 );
712#endif
713
Guido van Rossumd8225182000-03-10 22:33:05 +0000714/* Get the length of the Unicode object. */
715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
717 PyObject *unicode
718);
719
Victor Stinner157f83f2011-09-28 21:41:31 +0200720/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721 string representation. */
722
Martin v. Löwis18e16552006-02-15 17:27:45 +0000723PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000725 );
726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727/* Read a character from the string. */
728
729PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
730 PyObject *unicode,
731 Py_ssize_t index
732 );
733
734/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200735 PyUnicode_New, must not be shared, and must not have been hashed yet.
736
737 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738
739PyAPI_FUNC(int) PyUnicode_WriteChar(
740 PyObject *unicode,
741 Py_ssize_t index,
742 Py_UCS4 character
743 );
744
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000745#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000746/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000747PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000748#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000749
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200750/* Resize a Unicode object allocated by the legacy API (e.g.
751 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
752 PyUnicode_New) cannot be resized by this function.
753
754 The length is a number of Py_UNICODE characters (and not the number of code
755 points).
Guido van Rossum52c23592000-04-10 13:41:41 +0000756
757 *unicode is modified to point to the new (resized) object and 0
758 is returned on success.
759
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200760 If the refcount on the object is 1, the function resizes the string in
761 place, which is usually faster than allocating a new string (and copying
762 the characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000763
764 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200765 is returned and *unicode is left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000766
Mark Hammond91a681d2002-08-12 07:21:58 +0000767PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768 PyObject **unicode, /* Pointer to the Unicode object */
769 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000770 );
771
Guido van Rossumd8225182000-03-10 22:33:05 +0000772/* Coerce obj to a Unicode object and return a reference with
773 *incremented* refcount.
774
775 Coercion is done in the following way:
776
Georg Brandl952867a2010-06-27 10:17:12 +0000777 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000778 under the assumption that they contain data using the UTF-8
779 encoding. Decoding is done in "strict" mode.
   NOTE(review): the prototype takes encoding and errors arguments; the
   UTF-8/"strict" wording presumably describes what happens when they
   are NULL -- confirm against the implementation.
Guido van Rossumd8225182000-03-10 22:33:05 +0000780
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000781 2. All other objects (including Unicode objects) raise an
782 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000783
784 The API returns NULL in case of an error. The caller is responsible
785 for decref'ing the returned object.
786
787*/
788
Mark Hammond91a681d2002-08-12 07:21:58 +0000789PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000791 const char *encoding, /* encoding */
792 const char *errors /* error handling */
793 );
794
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000795/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000796 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000798 Unicode objects are passed back as-is (subclasses are converted to
799 true Unicode objects); all other objects are delegated to
800 PyUnicode_FromEncodedObject(obj, NULL, "strict"), which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000801 using the UTF-8 encoding as the basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000802
803 The API returns NULL in case of an error. The caller is responsible
804 for decref'ing the returned object.
805
806*/
807
Mark Hammond91a681d2002-08-12 07:21:58 +0000808PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000810 );
811
Victor Stinner1205f272010-09-11 00:54:47 +0000812PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
813 const char *format, /* ASCII-encoded string */
814 va_list vargs
815 );
816PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
817 const char *format, /* ASCII-encoded string */
818 ...
819 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000821#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000822/* Format the object based on the format_spec, as defined in PEP 3101
823 (Advanced String Formatting). */
824PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825 PyObject *format_spec,
826 Py_ssize_t start,
827 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000828#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000829
Walter Dörwald16807132007-05-25 13:52:07 +0000830PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
831PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000832PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
833 const char *u /* UTF-8 encoded string */
834 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000835#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000836PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000837#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000838
839/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840#define PyUnicode_CHECK_INTERNED(op) \
841 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000842
Guido van Rossumd8225182000-03-10 22:33:05 +0000843/* --- wchar_t support for platforms which support it --------------------- */
844
845#ifdef HAVE_WCHAR_H
846
Georg Brandl952867a2010-06-27 10:17:12 +0000847/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000848 size.
849
850 The buffer is copied into the new object. */
851
Mark Hammond91a681d2002-08-12 07:21:58 +0000852PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000853 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000854 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000855 );
856
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000857/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000858 most size wchar_t characters are copied.
859
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000860 Note that the resulting wchar_t string may or may not be
861 0-terminated. It is the responsibility of the caller to make sure
862 that the wchar_t string is 0-terminated in case this is required by
863 the application.
864
865 Returns the number of wchar_t characters copied (excluding a
866 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000867 error. */
868
Martin v. Löwis18e16552006-02-15 17:27:45 +0000869PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000870 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000871 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000872 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000873 );
874
Victor Stinner137c34c2010-09-29 10:25:54 +0000875/* Convert the Unicode object to a wide character string. The output string
876 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200877 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000878
879 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
880 on success. On error, returns NULL and raises a MemoryError; *size is
881 undefined in that case. */
882
883PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000884 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000885 Py_ssize_t *size /* number of characters of the result */
886 );
887
Victor Stinner9f789e72011-10-01 03:57:28 +0200888#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200890#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891
Guido van Rossumd8225182000-03-10 22:33:05 +0000892#endif
893
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000894/* --- Unicode ordinals --------------------------------------------------- */
895
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896/* Create a Unicode Object from the given Unicode code point ordinal.
897
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000898 The ordinal must be in range(0x10000) on narrow Python builds
899 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
900 raised in case it is not.
901
902*/
903
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000904PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000905
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000906/* --- Free-list management ----------------------------------------------- */
907
908/* Clear the free list used by the Unicode implementation.
909
910 This can be used to release memory used for objects on the free
911 list back to the Python memory allocator.
912
913*/
914
915PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
916
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000917/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000918
919 Many of these APIs take two arguments, encoding and errors. These
920 parameters have the same semantics as the corresponding arguments
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000921 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000922
Georg Brandl952867a2010-06-27 10:17:12 +0000923 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000924
925 Error handling is set by errors which may also be set to NULL
926 meaning to use the default handling defined for the codec. Default
927 error handling for all builtin codecs is "strict" (ValueErrors are
928 raised).
929
930 The codecs all use a similar interface. Only deviations from the
931 generic interface are documented.
932
933*/
934
Fred Drakecb093fe2000-05-09 19:51:53 +0000935/* --- Manage the default encoding ---------------------------------------- */
936
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000937/* Returns a pointer to the UTF-8 (default encoding) representation of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000938 Unicode object unicode and stores the size of the encoded representation,
939 in bytes, in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000940
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000941 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000942
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200943 This function caches the UTF-8 encoded string in the unicodeobject
944 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945 when the unicodeobject is deallocated.
946
947 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
948 support the previous internal function with the same behaviour.
949
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000950 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000951 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000952
953 *** If you need to access the Unicode object as a UTF-8 bytes string,
954 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000955*/
956
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000957#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000959 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000960 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000962#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000963
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000964/* Returns a pointer to the UTF-8 (default encoding) representation of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000965 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
968 in the unicodeobject.
969
970 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
971 support the previous internal function with the same behaviour.
972
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000973 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000974 extracted from the returned data.
975
976 *** This API is for interpreter INTERNAL USE ONLY and will likely
977 *** be removed or changed in the future.
978
979 *** If you need to access the Unicode object as a UTF-8 bytes string,
980 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000981
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000982*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000983
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000984#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
986#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000987#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000988
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000989/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000990
Mark Hammond91a681d2002-08-12 07:21:58 +0000991PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000992
Guido van Rossumd8225182000-03-10 22:33:05 +0000993/* --- Generic Codecs ----------------------------------------------------- */
994
995/* Create a Unicode object by decoding the encoded string s of the
996 given size. */
997
Mark Hammond91a681d2002-08-12 07:21:58 +0000998PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001001 const char *encoding, /* encoding */
1002 const char *errors /* error handling */
1003 );
1004
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001005/* Decode a Unicode object unicode and return the result as Python
1006 object. */
1007
1008PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 PyObject *unicode, /* Unicode object */
1010 const char *encoding, /* encoding */
1011 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001012 );
1013
1014/* Decode a Unicode object unicode and return the result as Unicode
1015 object. */
1016
1017PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001018 PyObject *unicode, /* Unicode object */
1019 const char *encoding, /* encoding */
1020 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001021 );
1022
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 Python string object. */
1025
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001026#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001027PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001028 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001030 const char *encoding, /* encoding */
1031 const char *errors /* error handling */
1032 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001033#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001034
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001035/* Encodes a Unicode object and returns the result as Python
1036 object. */
1037
1038PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001039 PyObject *unicode, /* Unicode object */
1040 const char *encoding, /* encoding */
1041 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001042 );
1043
Guido van Rossumd8225182000-03-10 22:33:05 +00001044/* Encodes a Unicode object and returns the result as Python string
1045 object. */
1046
Mark Hammond91a681d2002-08-12 07:21:58 +00001047PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048 PyObject *unicode, /* Unicode object */
1049 const char *encoding, /* encoding */
1050 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001051 );
1052
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001053/* Encodes a Unicode object and returns the result as Unicode
1054 object. */
1055
1056PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 PyObject *unicode, /* Unicode object */
1058 const char *encoding, /* encoding */
1059 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001060 );
1061
1062/* Build an encoding map. */
1063
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001064PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1065 PyObject* string /* 256 character map */
1066 );
1067
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068/* --- UTF-7 Codecs ------------------------------------------------------- */
1069
Mark Hammond91a681d2002-08-12 07:21:58 +00001070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 const char *string, /* UTF-7 encoded string */
1072 Py_ssize_t length, /* size of string */
1073 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 );
1075
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001076PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077 const char *string, /* UTF-7 encoded string */
1078 Py_ssize_t length, /* size of string */
1079 const char *errors, /* error handling */
1080 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001081 );
1082
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001083#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001084PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 const Py_UNICODE *data, /* Unicode char buffer */
1086 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1087 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1088 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1089 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001091#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092
Guido van Rossumd8225182000-03-10 22:33:05 +00001093/* --- UTF-8 Codecs ------------------------------------------------------- */
1094
Mark Hammond91a681d2002-08-12 07:21:58 +00001095PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001096 const char *string, /* UTF-8 encoded string */
1097 Py_ssize_t length, /* size of string */
1098 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001099 );
1100
Walter Dörwald69652032004-09-07 20:24:22 +00001101PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001102 const char *string, /* UTF-8 encoded string */
1103 Py_ssize_t length, /* size of string */
1104 const char *errors, /* error handling */
1105 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001106 );
1107
Mark Hammond91a681d2002-08-12 07:21:58 +00001108PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001110 );
1111
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001112#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1114 PyObject *unicode,
1115 const char *errors);
1116
Mark Hammond91a681d2002-08-12 07:21:58 +00001117PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 const Py_UNICODE *data, /* Unicode char buffer */
1119 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1120 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001121 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001122#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001123
Walter Dörwald41980ca2007-08-16 21:55:45 +00001124/* --- UTF-32 Codecs ------------------------------------------------------ */
1125
1126/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1127 the corresponding Unicode object.
1128
1129 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001130 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001131
1132 If byteorder is non-NULL, the decoder starts decoding using the
1133 given byte order:
1134
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001135 *byteorder == -1: little endian
1136 *byteorder == 0: native order
1137 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001138
1139 In native mode, the first four bytes of the stream are checked for a
1140 BOM mark. If found, the BOM mark is analysed, the byte order
1141 adjusted and the BOM skipped. In the other modes, no BOM mark
1142 interpretation is done. After completion, *byteorder is set to the
1143 current byte order at the end of input data.
1144
1145 If byteorder is NULL, the codec starts in native order mode.
1146
1147*/
1148
1149PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001150 const char *string, /* UTF-32 encoded string */
1151 Py_ssize_t length, /* size of string */
1152 const char *errors, /* error handling */
1153 int *byteorder /* pointer to byteorder to use
1154 0=native;-1=LE,1=BE; updated on
1155 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001156 );
1157
1158PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001159 const char *string, /* UTF-32 encoded string */
1160 Py_ssize_t length, /* size of string */
1161 const char *errors, /* error handling */
1162 int *byteorder, /* pointer to byteorder to use
1163 0=native;-1=LE,1=BE; updated on
1164 exit */
1165 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001166 );
1167
1168/* Returns a Python string using the UTF-32 encoding in native byte
1169 order. The string always starts with a BOM mark. */
1170
1171PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001173 );
1174
1175/* Returns a Python string object holding the UTF-32 encoded value of
1176 the Unicode data.
1177
1178 If byteorder is not 0, output is written according to the following
1179 byte order:
1180
1181 byteorder == -1: little endian
1182 byteorder == 0: native byte order (writes a BOM mark)
1183 byteorder == 1: big endian
1184
1185 If byteorder is 0, the output string will always start with the
1186 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1187 prepended.
1188
1189*/
1190
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001191#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001192PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 const Py_UNICODE *data, /* Unicode char buffer */
1194 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1195 const char *errors, /* error handling */
1196 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001197 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001198#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001199
Guido van Rossumd8225182000-03-10 22:33:05 +00001200/* --- UTF-16 Codecs ------------------------------------------------------ */
1201
Guido van Rossum9e896b32000-04-05 20:11:21 +00001202/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001203 the corresponding Unicode object.
1204
1205 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001207
1208 If byteorder is non-NULL, the decoder starts decoding using the
1209 given byte order:
1210
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001211 *byteorder == -1: little endian
1212 *byteorder == 0: native order
1213 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001214
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001215 In native mode, the first two bytes of the stream are checked for a
1216 BOM mark. If found, the BOM mark is analysed, the byte order
1217 adjusted and the BOM skipped. In the other modes, no BOM mark
1218 interpretation is done. After completion, *byteorder is set to the
1219 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001220
1221 If byteorder is NULL, the codec starts in native order mode.
1222
1223*/
1224
Mark Hammond91a681d2002-08-12 07:21:58 +00001225PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 const char *string, /* UTF-16 encoded string */
1227 Py_ssize_t length, /* size of string */
1228 const char *errors, /* error handling */
1229 int *byteorder /* pointer to byteorder to use
1230 0=native;-1=LE,1=BE; updated on
1231 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001232 );
1233
Walter Dörwald69652032004-09-07 20:24:22 +00001234PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001235 const char *string, /* UTF-16 encoded string */
1236 Py_ssize_t length, /* size of string */
1237 const char *errors, /* error handling */
1238 int *byteorder, /* pointer to byteorder to use
1239 0=native;-1=LE,1=BE; updated on
1240 exit */
1241 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001242 );
1243
Guido van Rossumd8225182000-03-10 22:33:05 +00001244/* Returns a Python string using the UTF-16 encoding in native byte
1245 order. The string always starts with a BOM mark. */
1246
Mark Hammond91a681d2002-08-12 07:21:58 +00001247PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001249 );
1250
1251/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001252 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001253
1254 The byteorder argument selects the byte order of the output as
1255 follows:
1256
1257 byteorder == -1: little endian
1258 byteorder == 0: native byte order (writes a BOM mark)
1259 byteorder == 1: big endian
1260
1261 If byteorder is 0, the output string will always start with the
1262 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1263 prepended.
1264
1265 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1266 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001267 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001268
1269*/
1270
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001271#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001272PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 const Py_UNICODE *data, /* Unicode char buffer */
1274 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1275 const char *errors, /* error handling */
1276 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001277 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001278#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001279
1280/* --- Unicode-Escape Codecs ---------------------------------------------- */
1281
Mark Hammond91a681d2002-08-12 07:21:58 +00001282PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 const char *string, /* Unicode-Escape encoded string */
1284 Py_ssize_t length, /* size of string */
1285 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001286 );
1287
Mark Hammond91a681d2002-08-12 07:21:58 +00001288PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001290 );
1291
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001292#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001293PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 const Py_UNICODE *data, /* Unicode char buffer */
1295 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001296 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001298
1299/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1300
Mark Hammond91a681d2002-08-12 07:21:58 +00001301PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 const char *string, /* Raw-Unicode-Escape encoded string */
1303 Py_ssize_t length, /* size of string */
1304 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001305 );
1306
Mark Hammond91a681d2002-08-12 07:21:58 +00001307PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 );
1310
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001311#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001312PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 const Py_UNICODE *data, /* Unicode char buffer */
1314 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001315 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001316#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001317
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001318/* --- Unicode Internal Codec ---------------------------------------------
1319
1320 Only for internal use in _codecsmodule.c */
1321
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001323PyObject *_PyUnicode_DecodeUnicodeInternal(
1324 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001325 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001326 const char *errors
1327 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001328#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001329
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001331
1332 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1333
1334*/
1335
Mark Hammond91a681d2002-08-12 07:21:58 +00001336PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 const char *string, /* Latin-1 encoded string */
1338 Py_ssize_t length, /* size of string */
1339 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001340 );
1341
Mark Hammond91a681d2002-08-12 07:21:58 +00001342PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001344 );
1345
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001346#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1348 PyObject* unicode,
1349 const char* errors);
1350
Mark Hammond91a681d2002-08-12 07:21:58 +00001351PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001352 const Py_UNICODE *data, /* Unicode char buffer */
1353 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1354 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001355 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001356#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001357
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001358/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001359
1360 Only 7-bit ASCII data is accepted. All other codes generate errors.
1361
1362*/
1363
Mark Hammond91a681d2002-08-12 07:21:58 +00001364PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 const char *string, /* ASCII encoded string */
1366 Py_ssize_t length, /* size of string */
1367 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001368 );
1369
Mark Hammond91a681d2002-08-12 07:21:58 +00001370PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001372 );
1373
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001374#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1376 PyObject* unicode,
1377 const char* errors);
1378
Mark Hammond91a681d2002-08-12 07:21:58 +00001379PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 const Py_UNICODE *data, /* Unicode char buffer */
1381 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1382 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001383 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001384#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001385
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001387
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001389
1390 Decoding mappings must map single string characters to single
1391 Unicode characters, integers (which are then interpreted as Unicode
1392 ordinals) or None (meaning "undefined mapping" and causing an
1393 error).
1394
1395 Encoding mappings must map single Unicode characters to single
1396 string characters, integers (which are then interpreted as Latin-1
1397 ordinals) or None (meaning "undefined mapping" and causing an
1398 error).
1399
1400 If a character lookup fails with a LookupError, the character is
1401 copied as-is meaning that its ordinal value will be interpreted as
1402 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1403 to contain those mappings which map characters to different code
1404 points.
1405
1406*/
1407
Mark Hammond91a681d2002-08-12 07:21:58 +00001408PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 const char *string, /* Encoded string */
1410 Py_ssize_t length, /* size of string */
1411 PyObject *mapping, /* character mapping
1412 (char ordinal -> unicode ordinal) */
1413 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001414 );
1415
Mark Hammond91a681d2002-08-12 07:21:58 +00001416PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 PyObject *unicode, /* Unicode object */
1418 PyObject *mapping /* character mapping
1419 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001420 );
1421
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001422#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001423PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001424 const Py_UNICODE *data, /* Unicode char buffer */
1425 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1426 PyObject *mapping, /* character mapping
1427 (unicode ordinal -> char ordinal) */
1428 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001429 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001430PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1431 PyObject *unicode, /* Unicode object */
1432 PyObject *mapping, /* character mapping
1433 (unicode ordinal -> char ordinal) */
1434 const char *errors /* error handling */
1435 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001436#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001437
1438/* Translate a Py_UNICODE buffer of the given length by applying a
1439 character mapping table to it and return the resulting Unicode
1440 object.
1441
1442 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001444
1445 Mapping tables may be dictionaries or sequences. Unmapped character
1446 ordinals (ones which cause a LookupError) are left untouched and
1447 are copied as-is.
1448
1449*/
1450
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001451#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001452PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001453 const Py_UNICODE *data, /* Unicode char buffer */
1454 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1455 PyObject *table, /* Translate table */
1456 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001457 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001458#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001459
Victor Stinner99b95382011-07-04 14:23:54 +02001460#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001461
Guido van Rossumefec1152000-03-28 02:01:15 +00001462/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001463
Mark Hammond91a681d2002-08-12 07:21:58 +00001464PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001465 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001466 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001467 const char *errors /* error handling */
1468 );
1469
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001470PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1471 const char *string, /* MBCS encoded string */
1472 Py_ssize_t length, /* size of string */
1473 const char *errors, /* error handling */
1474 Py_ssize_t *consumed /* bytes consumed */
1475 );
1476
Victor Stinner3a50e702011-10-18 21:21:00 +02001477PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1478 int code_page, /* code page number */
1479 const char *string, /* encoded string */
1480 Py_ssize_t length, /* size of string */
1481 const char *errors, /* error handling */
1482 Py_ssize_t *consumed /* bytes consumed */
1483 );
1484
Mark Hammond91a681d2002-08-12 07:21:58 +00001485PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001486 PyObject *unicode /* Unicode object */
1487 );
1488
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001489#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001490PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001491 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001492 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001493 const char *errors /* error handling */
1494 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001495#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001496
Victor Stinner3a50e702011-10-18 21:21:00 +02001497PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1498 int code_page, /* code page number */
1499 PyObject *unicode, /* Unicode object */
1500 const char *errors /* error handling */
1501 );
1502
Victor Stinner99b95382011-07-04 14:23:54 +02001503#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001504
Guido van Rossum9e896b32000-04-05 20:11:21 +00001505/* --- Decimal Encoder ---------------------------------------------------- */
1506
1507/* Takes a Unicode string holding a decimal value and writes it into
1508 an output buffer using standard ASCII digit codes.
1509
1510 The output buffer has to provide at least length+1 bytes of storage
1511 area. The output string is 0-terminated.
1512
1513 The encoder converts whitespace to ' ', decimal characters to their
1514 corresponding ASCII digit and all other Latin-1 characters except
1515 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1516 are treated as errors. This includes embedded NUL characters.
1517
1518 Error handling is defined by the errors argument:
1519
1520 NULL or "strict": raise a ValueError
1521 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001522 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001523 "replace": replaces illegal characters with '?'
1524
1525 Returns 0 on success, -1 on failure.
1526
1527*/
1528
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001529#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001530PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 Py_UNICODE *s, /* Unicode buffer */
1532 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1533 char *output, /* Output buffer; must have size >= length+1
                    (room for the terminating 0 byte) */
1534 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001535 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001536#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001537
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001538/* Transforms code points that have decimal digit property to the
1539 corresponding ASCII digit code points.
1540
1541 Returns a new Unicode string on success, NULL on failure.
1542*/
1543
Georg Brandlb5503082010-12-05 11:40:48 +00001544#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001545PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1546 Py_UNICODE *s, /* Unicode buffer */
1547 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1548 );
Georg Brandlb5503082010-12-05 11:40:48 +00001549#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1552 as argument instead of a raw buffer and length. This function additionally
1553 transforms spaces to ASCII because this is what the callers in longobject,
1554 floatobject, and complexobject did anyways. */
1555
1556#ifndef Py_LIMITED_API
1557PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1558 PyObject *unicode /* Unicode object */
1559 );
1560#endif
1561
Martin v. Löwis011e8422009-05-05 04:43:17 +00001562/* --- File system encoding ---------------------------------------------- */
1563
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001564/* ParseTuple converter: encode str objects to bytes using
1565 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001566
1567PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1568
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001569/* ParseTuple converter: decode bytes objects to unicode using
1570 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1571
1572PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1573
Victor Stinner77c38622010-05-14 15:58:55 +00001574/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1575 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001576
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001577 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1578 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001579
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001580 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001581*/
1582
1583PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1584 const char *s /* encoded string */
1585 );
1586
Victor Stinner77c38622010-05-14 15:58:55 +00001587/* Decode a string using Py_FileSystemDefaultEncoding
1588 and the "surrogateescape" error handler.
1589
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001590 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1591 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001592*/
1593
Martin v. Löwis011e8422009-05-05 04:43:17 +00001594PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1595 const char *s, /* encoded string */
1596 Py_ssize_t size /* size */
1597 );
1598
Victor Stinnerae6265f2010-05-15 16:27:27 +00001599/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001600 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001601
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001602 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1603 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001604*/
1605
1606PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1607 PyObject *unicode
1608 );
1609
Guido van Rossumd8225182000-03-10 22:33:05 +00001610/* --- Methods & Slots ----------------------------------------------------
1611
1612 These are capable of handling Unicode objects and strings on input
1613 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001614 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001615
1616/* Concat two strings giving a new Unicode string. */
1617
Mark Hammond91a681d2002-08-12 07:21:58 +00001618PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001619 PyObject *left, /* Left string */
1620 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001621 );
1622
Walter Dörwald1ab83302007-05-18 17:15:44 +00001623/* Concat two strings and put the result in *pleft
1624 (sets *pleft to NULL on error) */
1625
1626PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 PyObject **pleft, /* Pointer to left string */
1628 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001629 );
1630
1631/* Concat two strings, put the result in *pleft and drop the right object
1632 (sets *pleft to NULL on error) */
1633
1634PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 PyObject **pleft, /* Pointer to left string */
1636 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001637 );
1638
Guido van Rossumd8225182000-03-10 22:33:05 +00001639/* Split a string giving a list of Unicode strings.
1640
1641 If sep is NULL, splitting will be done at all whitespace
1642 substrings. Otherwise, splits occur at the given separator.
1643
1644 At most maxsplit splits will be done. If negative, no limit is set.
1645
1646 Separators are not included in the resulting list.
1647
1648*/
1649
Mark Hammond91a681d2002-08-12 07:21:58 +00001650PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 PyObject *s, /* String to split */
1652 PyObject *sep, /* String separator */
1653 Py_ssize_t maxsplit /* Maxsplit count */
1654 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001655
1656/* Split a string at line breaks, giving a list of Unicode strings.
1657
1658 CRLF is considered to be one line break. Line breaks are not
1659 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660
Mark Hammond91a681d2002-08-12 07:21:58 +00001661PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 PyObject *s, /* String to split */
1663 int keepends /* If true, line end markers are included */
1664 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001665
Thomas Wouters477c8d52006-05-27 19:21:47 +00001666/* Partition a string using a given separator. */
1667
1668PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001669 PyObject *s, /* String to partition */
1670 PyObject *sep /* String separator */
1671 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001672
1673/* Partition a string using a given separator, searching from the end of the
1674 string. */
1675
1676PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 PyObject *s, /* String to partition */
1678 PyObject *sep /* String separator */
1679 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001680
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001681/* Split a string giving a list of Unicode strings.
1682
1683 If sep is NULL, splitting will be done at all whitespace
1684 substrings. Otherwise, splits occur at the given separator.
1685
1686 At most maxsplit splits will be done; if maxsplit is negative, no
1687 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1688 the end of the string.
1689
1690 Separators are not included in the resulting list.
1691
1692*/
1693
1694PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001695 PyObject *s, /* String to split */
1696 PyObject *sep, /* String separator */
1697 Py_ssize_t maxsplit /* Maxsplit count */
1698 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001699
Guido van Rossumd8225182000-03-10 22:33:05 +00001700/* Translate a string by applying a character mapping table to it and
1701 return the resulting Unicode object.
1702
1703 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001704 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001705
1706 Mapping tables may be dictionaries or sequences. Unmapped character
1707 ordinals (ones which cause a LookupError) are left untouched and
1708 are copied as-is.
1709
1710*/
1711
Mark Hammond91a681d2002-08-12 07:21:58 +00001712PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001713 PyObject *str, /* String */
1714 PyObject *table, /* Translate table */
1715 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001716 );
1717
1718/* Join a sequence of strings using the given separator and return
1719 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001720
Mark Hammond91a681d2002-08-12 07:21:58 +00001721PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001722 PyObject *separator, /* Separator string */
1723 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001724 );
1725
1726/* Return 1 if substr matches str[start:end] at the given tail end, 0
1727 otherwise. */
1728
Martin v. Löwis18e16552006-02-15 17:27:45 +00001729PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001730 PyObject *str, /* String */
1731 PyObject *substr, /* Prefix or Suffix string */
1732 Py_ssize_t start, /* Start index */
1733 Py_ssize_t end, /* Stop index */
1734 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001735 );
1736
1737/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001738 given search direction or -1 if not found. -2 is returned in case
1739 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001740
Martin v. Löwis18e16552006-02-15 17:27:45 +00001741PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001742 PyObject *str, /* String */
1743 PyObject *substr, /* Substring to find */
1744 Py_ssize_t start, /* Start index */
1745 Py_ssize_t end, /* Stop index */
1746 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001747 );
1748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749/* Like PyUnicode_Find, but search for single character only. */
1750PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1751 PyObject *str, /* String */
1752 Py_UCS4 ch, /* Character to find */
1753 Py_ssize_t start, /* Start index */
1754 Py_ssize_t end, /* Stop index */
1755 int direction /* Find direction, as for PyUnicode_Find */
1756 );
1757
Barry Warsaw51ac5802000-03-20 16:36:48 +00001758/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001759
Martin v. Löwis18e16552006-02-15 17:27:45 +00001760PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001761 PyObject *str, /* String */
1762 PyObject *substr, /* Substring to count */
1763 Py_ssize_t start, /* Start index */
1764 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001765 );
1766
Barry Warsaw51ac5802000-03-20 16:36:48 +00001767/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001768 and return the resulting Unicode object. */
1769
Mark Hammond91a681d2002-08-12 07:21:58 +00001770PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001771 PyObject *str, /* String */
1772 PyObject *substr, /* Substring to find */
1773 PyObject *replstr, /* Substring to replace */
1774 Py_ssize_t maxcount /* Max. number of replacements to apply;
1775 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001776 );
1777
1778/* Compare two strings and return -1, 0, 1 for less than, equal,
1779 greater than resp. */
1780
Mark Hammond91a681d2002-08-12 07:21:58 +00001781PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001782 PyObject *left, /* Left string */
1783 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001784 );
1785
/* Compare a Unicode object with an ASCII-encoded C string.
   NOTE(review): presumably returns -1, 0, 1 like PyUnicode_Compare --
   confirm against the implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001786PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1787 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001788 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001789 );
1790
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001791/* Rich compare two strings and return one of the following:
1792
1793 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001794 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001795 - Py_NotImplemented in case the type combination is unknown
1796
1797 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1798 case the conversion of the arguments to Unicode fails with a
1799 UnicodeDecodeError.
1800
1801 Possible values for op:
1802
1803 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1804
1805*/
1806
1807PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001808 PyObject *left, /* Left string */
1809 PyObject *right, /* Right string */
1810 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001811 );
1812
Thomas Wouters7e474022000-07-16 12:04:32 +00001813/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001814 the resulting Unicode string. */
1815
Mark Hammond91a681d2002-08-12 07:21:58 +00001816PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001817 PyObject *format, /* Format string */
1818 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001819 );
1820
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001821/* Checks whether element is contained in container and returns 1/0
1822 accordingly.
1823
1824 element has to coerce to a one-element Unicode string. -1 is
1825 returned in case of an error. */
1826
Mark Hammond91a681d2002-08-12 07:21:58 +00001827PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001828 PyObject *container, /* Container string */
1829 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001830 );
1831
Martin v. Löwis47383402007-08-15 07:32:56 +00001832/* Checks whether argument is a valid identifier. */
1833
1834PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1835
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001836#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001837/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001838PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001839 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001840 int striptype,
1841 PyObject *sepobj
1842 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001843#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001844
Eric Smith5807c412008-05-11 21:00:57 +00001845/* Using the current locale, insert the thousands grouping
1846 into the string pointed to by buffer. For the argument descriptions,
1847 see Objects/stringlib/localeutil.h */
1848
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001849#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001850PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1851 Py_ssize_t n_buffer,
1852 Py_UNICODE *digits,
1853 Py_ssize_t n_digits,
1854 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001855#endif
Eric Smith5807c412008-05-11 21:00:57 +00001856
Eric Smitha3b1ac82009-04-03 14:45:06 +00001857/* Using explicit passed-in values, insert the thousands grouping
1858 into the string pointed to by buffer. For the argument descriptions,
1859 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001860#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001862 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 int kind,
1864 void *buffer,
1865 Py_ssize_t n_buffer,
1866 void *digits,
1867 Py_ssize_t n_digits,
1868 Py_ssize_t min_width,
1869 const char *grouping,
1870 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001871#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001872/* === Characters Type APIs =============================================== */
1873
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001874/* Helper array used by Py_UNICODE_ISSPACE(). */
1875
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001876#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001877PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1878
Guido van Rossumd8225182000-03-10 22:33:05 +00001879/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001880 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001881
1882 These APIs are implemented in Objects/unicodectype.c.
1883
1884*/
1885
Mark Hammond91a681d2002-08-12 07:21:58 +00001886PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001887 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001888 );
1889
Mark Hammond91a681d2002-08-12 07:21:58 +00001890PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001891 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001892 );
1893
Mark Hammond91a681d2002-08-12 07:21:58 +00001894PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001895 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001896 );
1897
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001898PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001899 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001900 );
1901
1902PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001903 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001904 );
1905
Mark Hammond91a681d2002-08-12 07:21:58 +00001906PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001907 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001908 );
1909
Mark Hammond91a681d2002-08-12 07:21:58 +00001910PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001911 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001912 );
1913
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001914PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1915 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001916 );
1917
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001918PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1919 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001920 );
1921
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001922PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1923 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001924 );
1925
Mark Hammond91a681d2002-08-12 07:21:58 +00001926PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001927 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001928 );
1929
Mark Hammond91a681d2002-08-12 07:21:58 +00001930PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001931 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001932 );
1933
Mark Hammond91a681d2002-08-12 07:21:58 +00001934PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001935 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
Mark Hammond91a681d2002-08-12 07:21:58 +00001938PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001939 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Mark Hammond91a681d2002-08-12 07:21:58 +00001942PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001943 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001944 );
1945
Mark Hammond91a681d2002-08-12 07:21:58 +00001946PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001947 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001948 );
1949
Georg Brandl559e5d72008-06-11 18:37:52 +00001950PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001951 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001952 );
1953
Mark Hammond91a681d2002-08-12 07:21:58 +00001954PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001955 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001956 );
1957
/* String helpers operating on Py_UNICODE buffers. They are declared inside
   the surrounding #ifndef Py_LIMITED_API section.
   NOTE(review): the names and signatures mirror the C library functions
   strlen, strcpy, strcat, strncpy, strcmp, strncmp, strchr and strrchr;
   presumably each behaves like its namesake on nul-terminated Py_UNICODE
   strings -- confirm against the implementation. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001958PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1959 const Py_UNICODE *u
1960 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001961
1962PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001963 Py_UNICODE *s1,
1964 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001965
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001966PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1967 Py_UNICODE *s1, const Py_UNICODE *s2);
1968
Martin v. Löwis5b222132007-06-10 09:51:05 +00001969PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001970 Py_UNICODE *s1,
1971 const Py_UNICODE *s2,
1972 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001973
1974PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001975 const Py_UNICODE *s1,
1976 const Py_UNICODE *s2
1977 );
1978
1979PyAPI_FUNC(int) Py_UNICODE_strncmp(
1980 const Py_UNICODE *s1,
1981 const Py_UNICODE *s2,
1982 size_t n
1983 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001984
1985PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001986 const Py_UNICODE *s,
1987 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001988 );
1989
Victor Stinner331ea922010-08-10 16:37:20 +00001990PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001991 const Py_UNICODE *s,
1992 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001993 );
1994
Victor Stinner71133ff2010-09-01 23:43:53 +00001995/* Create a copy of a unicode string ending with a nul character. Return NULL
1996 and raise a MemoryError exception on memory allocation failure, otherwise
1997 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1998
Victor Stinner46408602010-09-03 16:18:00 +00001999PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002000 PyObject *unicode
2001 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002002#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002003
/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not.
   NOTE(review): presumably validates the internal state of the Unicode
   object op, with check_content enabling additional checks of the
   character data -- confirm against the implementation. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002004#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002005PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002006 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002007 int check_content);
2008#endif
2009
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002010/********************* String Literals ****************************************/
2011/* This structure helps manage static strings. The basic usage goes like this:
2012 Instead of doing
2013
2014 r = PyObject_CallMethod(o, "foo", "args", ...);
2015
2016 do
2017
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002018 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002019 ...
2020 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2021
2022 PyId_foo is a static variable, either on block level or file level. On first
2023 usage, the string "foo" is interned, and the structures are linked. On interpreter
2024 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2025
2026 Alternatively, _Py_static_string allows choosing the variable name.
2027 _PyUnicode_FromId returns a new reference to the interned string.
2028 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2029*/
/* One statically allocated identifier; normally defined through the
   _Py_static_string or _Py_IDENTIFIER macros rather than by hand. */
2030typedef struct _Py_Identifier {
2031 struct _Py_Identifier *next; /* Link to another identifier; set when the structures are linked on first use */
2032 const char* string; /* C string this identifier stands for */
2033 PyObject *object; /* Starts as 0; NOTE(review): presumably holds the interned Unicode object after first use -- confirm */
2034} _Py_Identifier;
2035
/* _Py_static_string(varname, value) defines a static _Py_Identifier named
   varname whose string member is value; next and object start out as 0.
   _Py_IDENTIFIER(varname) is shorthand that defines PyId_<varname> with the
   stringified name varname as its string. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002036#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002037#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002038
2039/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2040PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2041/* Clear all static strings. */
2042PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2043
Guido van Rossumd8225182000-03-10 22:33:05 +00002044#ifdef __cplusplus
2045}
2046#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002047#endif /* !Py_UNICODEOBJECT_H */