blob: e1a5a2f5f7da7329b3222f3dd5c6c36f8976692e [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
Georg Brandlc6bc4c62011-10-05 16:23:09 +020088 With PEP 393, Py_UNICODE is deprecated and replaced with a
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020089 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138/* Since splitting on whitespace is an important use case, and
139 whitespace in most situations is solely ASCII whitespace, we
140 optimize for the common case by using a quick look-up table
141 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000142
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000143 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000145#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
152
153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
156
157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000161
162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
165
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000167
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168#define Py_UNICODE_ISALNUM(ch) \
169 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_UNICODE_ISDECIMAL(ch) || \
171 Py_UNICODE_ISDIGIT(ch) || \
172 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200174#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   target: pointer to a writable Py_UNICODE buffer with room for at least
           `length` elements.
   value:  the Py_UNICODE code unit to store.
   length: number of elements to fill; nothing is written if it is <= 0.

   Every argument is evaluated exactly once (the length is cached in n_
   before the loop), so arguments with side effects are safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* Macros to work with UTF-16 surrogates.

   The three predicates test whether ch lies in the surrogate range
   (0xD800-0xDFFF), the high half (0xD800-0xDBFF) or the low half
   (0xDC00-0xDFFF).  ch is evaluated twice, so it must not have side
   effects; it is fully parenthesized so that expression arguments such
   as `a | b` are compared as a whole. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; no check is made that
   `high` and `low` really are a high and a low surrogate. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
190
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000191/* Check if substring matches at given offset. The offset must be
192 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000193
Thomas Wouters477c8d52006-05-27 19:21:47 +0000194#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200195 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
196 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
197 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
198
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000199#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000200
Barry Warsaw51ac5802000-03-20 16:36:48 +0000201#ifdef __cplusplus
202extern "C" {
203#endif
204
Guido van Rossumd8225182000-03-10 22:33:05 +0000205/* --- Unicode Type ------------------------------------------------------- */
206
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000207#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200208
209/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
210 structure. state.ascii and state.compact are set, and the data
211 immediately follow the structure. utf8_length and wstr_length can be found
212 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000213typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200214 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200215
216 - compact ascii:
217
218 * structure = PyASCIIObject
219 * kind = PyUnicode_1BYTE_KIND
220 * compact = 1
221 * ascii = 1
222 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200223 * (length is the length of the utf8 and wstr strings)
224 * (data starts just after the structure)
       * (since ASCII is decoded from UTF-8, the utf8 string is the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200226
227 - compact:
228
229 * structure = PyCompactUnicodeObject
230 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
231 PyUnicode_4BYTE_KIND
232 * compact = 1
233 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200234 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200235 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200236 * utf8_length = 0 if utf8 is NULL
237 * wstr is shared with data and wstr_length=length
238 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100239 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200240 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200242
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200243 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200244
245 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100246 * length = 0 (use wstr_length)
247 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200248 * kind = PyUnicode_WCHAR_KIND
249 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200250 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200251 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100252 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200253 * wstr is not NULL
254 * data.any is NULL
255 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200256 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200257
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200258 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200259
260 * structure = PyUnicodeObject structure
261 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
262 PyUnicode_4BYTE_KIND
263 * compact = 0
264 * ready = 1
265 * data.any is not NULL
       * utf8 is shared with data.any and utf8_length = length if ascii = 1
267 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100268 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200269 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
         or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
271 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200272
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200273 Compact strings use only one memory block (structure + characters),
274 whereas legacy strings use one block for the structure and one block
275 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200276
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200277 Legacy strings are created by PyUnicode_FromUnicode() and
278 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
279 when PyUnicode_READY() is called.
280
281 See also _PyUnicode_CheckConsistency().
282 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000283 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200284 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000285 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200286 struct {
287 /*
288 SSTATE_NOT_INTERNED (0)
289 SSTATE_INTERNED_MORTAL (1)
290 SSTATE_INTERNED_IMMORTAL (2)
291
292 If interned != SSTATE_NOT_INTERNED, the two references from the
293 dictionary to this object are *not* counted in ob_refcnt.
294 */
295 unsigned int interned:2;
296 /* Character size:
297
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200298 - PyUnicode_WCHAR_KIND (0):
299
300 * character type = wchar_t (16 or 32 bits, depending on the
301 platform)
302
303 - PyUnicode_1BYTE_KIND (1):
304
305 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100306 * all characters are in the range U+0000-U+00FF (latin1)
307 * if ascii is set, all characters are in the range U+0000-U+007F
308 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200309 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200310
311 - PyUnicode_2BYTE_KIND (2):
312
313 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100314 * all characters are in the range U+0000-U+FFFF (BMP)
315 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200316
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200317 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200318
319 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100320 * all characters are in the range U+0000-U+10FFFF
321 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200323 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 /* Compact is with respect to the allocation scheme. Compact unicode
325 objects only require one memory block while non-compact objects use
326 one block for the PyUnicodeObject struct and another for its data
327 buffer. */
328 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100329 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200330 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
331 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200332 unsigned int ascii:1;
333 /* The ready flag indicates whether the object layout is initialized
334 completely. This means that this is either a compact object, or
335 the data pointer is filled out. The bit is redundant, and helps
336 to minimize the test in PyUnicode_IS_READY(). */
337 unsigned int ready:1;
338 } state;
339 wchar_t *wstr; /* wchar_t representation (null-terminated) */
340} PyASCIIObject;
341
342/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200343 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200344 immediately follow the structure. */
345typedef struct {
346 PyASCIIObject _base;
347 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
348 * terminating \0. */
349 char *utf8; /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of code units in wstr; a surrogate
                                 * pair counts as two code units. */
352} PyCompactUnicodeObject;
353
354/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
355 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200356 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357typedef struct {
358 PyCompactUnicodeObject _base;
359 union {
360 void *any;
361 Py_UCS1 *latin1;
362 Py_UCS2 *ucs2;
363 Py_UCS4 *ucs4;
364 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000365} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000366#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000367
Mark Hammond91a681d2002-08-12 07:21:58 +0000368PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000369PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000370
Thomas Wouters27d517b2007-02-25 20:39:11 +0000371#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000372 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
373#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000374
375/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000376#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377
/* Length of the wstr (wchar_t) representation of op.

   Compact ASCII objects use the PyASCIIObject structure, which has no
   wstr_length field; for them the length field is returned instead.
   op is evaluated more than once and no type checks are performed.
   op is parenthesized inside each cast so that an expression argument
   (e.g. pointer arithmetic) is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
382
383/* Returns the deprecated Py_UNICODE representation's size in code units
384 (this includes surrogate pairs as 2 units).
385 If the Py_UNICODE representation is not available, it will be computed
386 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
387
Victor Stinnerf3ae6202011-11-21 02:24:49 +0100388#define PyUnicode_GET_SIZE(op) \
389 (assert(PyUnicode_Check(op)), \
390 (((PyASCIIObject *)(op))->wstr) ? \
391 PyUnicode_WSTR_LENGTH(op) : \
392 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
393 assert(((PyASCIIObject *)(op))->wstr), \
394 PyUnicode_WSTR_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200395
Guido van Rossumd8225182000-03-10 22:33:05 +0000396#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200397 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
398
399/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
400 representation on demand. Using this macro is very inefficient now,
401 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
402 use PyUnicode_WRITE() and PyUnicode_READ(). */
403
Guido van Rossumd8225182000-03-10 22:33:05 +0000404#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405 (assert(PyUnicode_Check(op)), \
406 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
407 PyUnicode_AsUnicode((PyObject *)(op)))
408
Guido van Rossumd8225182000-03-10 22:33:05 +0000409#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410 ((const char *)(PyUnicode_AS_UNICODE(op)))
411
412
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200413/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414
Victor Stinner6f9568b2011-11-17 00:12:44 +0100415/* Values for PyASCIIObject.state: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200416
417/* Interning state. */
418#define SSTATE_NOT_INTERNED 0
419#define SSTATE_INTERNED_MORTAL 1
420#define SSTATE_INTERNED_IMMORTAL 2
421
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.

   op is parenthesized inside the cast so that an expression argument is
   converted as a whole rather than only its first operand. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)
427
428/* Return true if the string is compact or 0 if not.
429 No type checks or Ready calls are performed. */
430#define PyUnicode_IS_COMPACT(op) \
431 (((PyASCIIObject*)(op))->state.compact)
432
433/* Return true if the string is a compact ASCII string (use PyASCIIObject
434 structure), or 0 if not. No type checks or Ready calls are performed. */
435#define PyUnicode_IS_COMPACT_ASCII(op) \
436 (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200437
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200438enum PyUnicode_Kind {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200439/* String contains only wstr byte characters. This is only possible
Victor Stinnera3b334d2011-10-03 13:53:37 +0200440 when the string was created with a legacy API and _PyUnicode_Ready()
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441 has not been called yet. */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200442 PyUnicode_WCHAR_KIND = 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443/* Return values of the PyUnicode_KIND() macro: */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200444 PyUnicode_1BYTE_KIND = 1,
445 PyUnicode_2BYTE_KIND = 2,
446 PyUnicode_4BYTE_KIND = 4
447};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200448
Georg Brandl4975a9b2011-10-05 16:12:21 +0200449/* Return pointers to the canonical representation cast to unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 Py_UCS2, or Py_UCS4 for direct character access.
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200451 No checks are performed, use PyUnicode_KIND() before to ensure
452 these will work correctly. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200453
454#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
455#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
456#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
457
Victor Stinner157f83f2011-09-28 21:41:31 +0200458/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200459#define PyUnicode_KIND(op) \
460 (assert(PyUnicode_Check(op)), \
461 assert(PyUnicode_IS_READY(op)), \
462 ((PyASCIIObject *)(op))->state.kind)
463
Victor Stinner157f83f2011-09-28 21:41:31 +0200464/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200465#define _PyUnicode_COMPACT_DATA(op) \
Victor Stinner55c7e002011-10-18 23:32:53 +0200466 (PyUnicode_IS_ASCII(op) ? \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200467 ((void*)((PyASCIIObject*)(op) + 1)) : \
468 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
469
470#define _PyUnicode_NONCOMPACT_DATA(op) \
471 (assert(((PyUnicodeObject*)(op))->data.any), \
472 ((((PyUnicodeObject *)(op))->data.any)))
473
474#define PyUnicode_DATA(op) \
475 (assert(PyUnicode_Check(op)), \
476 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
477 _PyUnicode_NONCOMPACT_DATA(op))
478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200479/* In the access macros below, "kind" may be evaluated more than once.
480 All other macro parameters are evaluated exactly once, so it is safe
481 to put side effects into them (such as increasing the index). */
482
483/* Write into the canonical representation, this macro does not do any sanity
484 checks and is intended for usage in loops. The caller should cache the
Georg Brandl07de3252011-10-05 16:47:38 +0200485 kind and data pointers obtained from other macro calls.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200486 index is the index in the string (starts at 0) and value is the new
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200487 code point value which should be written to that location. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200488#define PyUnicode_WRITE(kind, data, index, value) \
489 do { \
490 switch ((kind)) { \
491 case PyUnicode_1BYTE_KIND: { \
492 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
493 break; \
494 } \
495 case PyUnicode_2BYTE_KIND: { \
496 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
497 break; \
498 } \
499 default: { \
500 assert((kind) == PyUnicode_4BYTE_KIND); \
501 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
502 } \
503 } \
504 } while (0)
505
Georg Brandl07de3252011-10-05 16:47:38 +0200506/* Read a code point from the string's canonical representation. No checks
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200507 or ready calls are performed. */
508#define PyUnicode_READ(kind, data, index) \
509 ((Py_UCS4) \
510 ((kind) == PyUnicode_1BYTE_KIND ? \
Victor Stinner7a48ff72011-10-02 00:55:25 +0200511 ((const Py_UCS1 *)(data))[(index)] : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200512 ((kind) == PyUnicode_2BYTE_KIND ? \
513 ((const Py_UCS2 *)(data))[(index)] : \
514 ((const Py_UCS4 *)(data))[(index)] \
515 ) \
516 ))
517
518/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
519 calls PyUnicode_KIND() and might call it twice. For single reads, use
520 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
521 cache kind and use PyUnicode_READ instead. */
522#define PyUnicode_READ_CHAR(unicode, index) \
Victor Stinner37943762011-10-02 20:33:18 +0200523 (assert(PyUnicode_Check(unicode)), \
524 assert(PyUnicode_IS_READY(unicode)), \
525 (Py_UCS4) \
526 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
527 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
528 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
529 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
530 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
531 ) \
532 ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_READY() to ensure that. */
537#define PyUnicode_GET_LENGTH(op) \
538 (assert(PyUnicode_Check(op)), \
539 assert(PyUnicode_IS_READY(op)), \
540 ((PyASCIIObject *)(op))->length)
541
542
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   Only the state.ready bit is read; no type checks are performed.
   op is parenthesized inside the cast so that an expression argument is
   converted as a whole rather than only its first operand. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
547
Victor Stinnera3b334d2011-10-03 13:53:37 +0200548/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200549 case. If the canonical representation is not yet set, it will still call
Victor Stinnera3b334d2011-10-03 13:53:37 +0200550 _PyUnicode_Ready().
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551 Returns 0 on success and -1 on errors. */
552#define PyUnicode_READY(op) \
553 (assert(PyUnicode_Check(op)), \
554 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200555 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557/* Return a maximum character value which is suitable for creating another
558 string based on op. This is always an approximation but more efficient
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200559 than iterating over the string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560#define PyUnicode_MAX_CHAR_VALUE(op) \
561 (assert(PyUnicode_IS_READY(op)), \
Victor Stinner88131042011-10-13 01:12:01 +0200562 (PyUnicode_IS_ASCII(op) ? \
563 (0x7f) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200565 (0xffU) : \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
Victor Stinner88131042011-10-13 01:12:01 +0200567 (0xffffU) : \
568 (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000570#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000571
572/* --- Constants ---------------------------------------------------------- */
573
574/* This Unicode character will be used as replacement character during
575 decoding if the errors argument is set to "replace". Note: the
576 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
577 Unicode 3.0. */
578
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200579#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000580
581/* === Public API ========================================================= */
582
583/* --- Plain Py_UNICODE --------------------------------------------------- */
584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200585/* With PEP 393, this is the recommended way to allocate a new unicode object.
586 This function will allocate the object and its buffer in a single memory
587 block. Objects created using this function are not resizable. */
588#ifndef Py_LIMITED_API
589PyAPI_FUNC(PyObject*) PyUnicode_New(
590 Py_ssize_t size, /* Number of code points in the new string */
591 Py_UCS4 maxchar /* maximum code point value in the string */
592 );
593#endif
594
Victor Stinnerd8f65102011-09-29 19:43:17 +0200595/* Initializes the canonical string representation from the deprecated
596 wstr/Py_UNICODE representation. This function is used to convert Unicode
597 objects which were created using the old API to the new flexible format
598 introduced with PEP 393.
599
600 Don't call this function directly, use the public PyUnicode_READY() macro
601 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602#ifndef Py_LIMITED_API
603PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200604 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605 );
606#endif
607
Victor Stinner034f6cf2011-09-30 02:26:44 +0200608/* Get a copy of a Unicode string. */
609PyAPI_FUNC(PyObject*) PyUnicode_Copy(
610 PyObject *unicode
611 );
612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200613/* Copy characters from one unicode object into another. This function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614 character conversion when necessary and falls back to memcpy when possible.
615
Victor Stinnera0702ab2011-09-29 14:14:38 +0200616 Fail if to is too small (smaller than how_many or smaller than
617 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
618 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200619
620 Return the number of characters written, or return -1 and raise an exception
621 on error.
622
623 Pseudo-code:
624
625 how_many = min(how_many, len(from) - from_start)
626 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
627 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200628
629 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200630 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200631#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200632PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633 PyObject *to,
634 Py_ssize_t to_start,
635 PyObject *from,
636 Py_ssize_t from_start,
637 Py_ssize_t how_many
638 );
639#endif
640
Guido van Rossumd8225182000-03-10 22:33:05 +0000641/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000643
644 u may be NULL which causes the contents to be undefined. It is the
645 user's responsibility to fill in the needed data afterwards. Note
646 that modifying the Unicode object contents after construction is
647 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000648
649 The buffer is copied into the new object. */
650
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000651#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000652PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000653 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000654 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000655 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000656#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000657
Georg Brandl952867a2010-06-27 10:17:12 +0000658/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000659PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000660 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000661 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 );
663
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000664/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000666PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000667 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000668 );
669
Victor Stinnerb9275c12011-10-05 14:01:42 +0200670/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
671 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672#ifndef Py_LIMITED_API
673PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
674 int kind,
675 const void *buffer,
676 Py_ssize_t size);
677#endif
678
679PyAPI_FUNC(PyObject*) PyUnicode_Substring(
680 PyObject *str,
681 Py_ssize_t start,
682 Py_ssize_t end);
683
Georg Brandldb6c7f52011-10-07 11:19:11 +0200684/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200685 is set. Return NULL and raise an exception on error. Raise a ValueError if
686 the buffer is smaller than the string. Return buffer on success.
687
688 buflen is the length of the buffer in (Py_UCS4) characters. */
689PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
690 PyObject *unicode,
691 Py_UCS4* buffer,
692 Py_ssize_t buflen,
693 int copy_null);
694
695/* Copy the string into a UCS4 buffer. A new buffer is allocated using
696 PyMem_Malloc; if this fails, NULL is returned with a memory error
697 exception set. */
698PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
699
Guido van Rossumd8225182000-03-10 22:33:05 +0000700/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 Py_UNICODE buffer.
702 If the wchar_t/Py_UNICODE representation is not yet available, this
703 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000704
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000705#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000706PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000707 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000708 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000709#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711/* Return a read-only pointer to the Unicode object's internal
712 Py_UNICODE buffer and save the length at size.
713 If the wchar_t/Py_UNICODE representation is not yet available, this
714 function will calculate it. */
715
716#ifndef Py_LIMITED_API
717PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
718 PyObject *unicode, /* Unicode object */
719 Py_ssize_t *size /* location where to save the length */
720 );
721#endif
722
Guido van Rossumd8225182000-03-10 22:33:05 +0000723/* Get the length of the Unicode object. */
724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200725PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
726 PyObject *unicode
727);
728
Victor Stinner157f83f2011-09-28 21:41:31 +0200729/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200730 string representation. */
731
Martin v. Löwis18e16552006-02-15 17:27:45 +0000732PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000734 );
735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736/* Read a character from the string. */
737
738PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
739 PyObject *unicode,
740 Py_ssize_t index
741 );
742
743/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200744 PyUnicode_New, must not be shared, and must not have been hashed yet.
745
746 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747
748PyAPI_FUNC(int) PyUnicode_WriteChar(
749 PyObject *unicode,
750 Py_ssize_t index,
751 Py_UCS4 character
752 );
753
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000754#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000755/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000756PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000757#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000758
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200759/* Resize a Unicode object allocated by the legacy API (e.g.
760 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
761 PyUnicode_New) cannot be resized by this function.
762
Victor Stinner93439992011-11-20 18:29:14 +0100763 The length is the number of characters (and not the number of Py_UNICODE units).
Guido van Rossum52c23592000-04-10 13:41:41 +0000764
765 *unicode is modified to point to the new (resized) object and 0
766 returned on success.
767
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200768 If the refcount on the object is 1, the function resizes the string in
769 place, which is usually faster than allocating a new string (and copying
770 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000771
772 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200773 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000774
Mark Hammond91a681d2002-08-12 07:21:58 +0000775PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 PyObject **unicode, /* Pointer to the Unicode object */
777 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000778 );
779
Guido van Rossumd8225182000-03-10 22:33:05 +0000780/* Coerce obj to a Unicode object and return a reference with
781 *incremented* refcount.
782
783 Coercion is done in the following way:
784
Georg Brandl952867a2010-06-27 10:17:12 +0000785 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000786 under the assumptions that they contain data using the UTF-8
787 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000788
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000789 2. All other objects (including Unicode objects) raise an
790 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000791
792 The API returns NULL in case of an error. The caller is responsible
793 for decref'ing the returned objects.
794
795*/
796
Mark Hammond91a681d2002-08-12 07:21:58 +0000797PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000799 const char *encoding, /* encoding */
800 const char *errors /* error handling */
801 );
802
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000803/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000804 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000806 Unicode objects are passed back as-is (subclasses are converted to
807 true Unicode objects), all other objects are delegated to
808 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000809 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000810
811 The API returns NULL in case of an error. The caller is responsible
812 for decref'ing the returned objects.
813
814*/
815
Mark Hammond91a681d2002-08-12 07:21:58 +0000816PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000818 );
819
Victor Stinner1205f272010-09-11 00:54:47 +0000820PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
821 const char *format, /* ASCII-encoded string */
822 va_list vargs
823 );
824PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
825 const char *format, /* ASCII-encoded string */
826 ...
827 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000829#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000830/* Format the object based on the format_spec, as defined in PEP 3101
831 (Advanced String Formatting). */
832PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833 PyObject *format_spec,
834 Py_ssize_t start,
835 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000836#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000837
Walter Dörwald16807132007-05-25 13:52:07 +0000838PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
839PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000840PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
841 const char *u /* UTF-8 encoded string */
842 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000843#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000844PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000845#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000846
847/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848#define PyUnicode_CHECK_INTERNED(op) \
849 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000850
Guido van Rossumd8225182000-03-10 22:33:05 +0000851/* --- wchar_t support for platforms which support it --------------------- */
852
853#ifdef HAVE_WCHAR_H
854
Georg Brandl952867a2010-06-27 10:17:12 +0000855/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000856 size.
857
858 The buffer is copied into the new object. */
859
Mark Hammond91a681d2002-08-12 07:21:58 +0000860PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000861 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000863 );
864
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000865/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000866 most size wchar_t characters are copied.
867
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000868 Note that the resulting wchar_t string may or may not be
869 0-terminated. It is the responsibility of the caller to make sure
870 that the wchar_t string is 0-terminated in case this is required by
871 the application.
872
873 Returns the number of wchar_t characters copied (excluding a
874 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000875 error. */
876
Martin v. Löwis18e16552006-02-15 17:27:45 +0000877PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000878 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000879 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000880 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000881 );
882
Victor Stinner137c34c2010-09-29 10:25:54 +0000883/* Convert the Unicode object to a wide character string. The output string
884 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200885 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000886
887 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
888 on success. On error, returns NULL and raises a MemoryError; *size is
889 undefined. */
890
891PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000892 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000893 Py_ssize_t *size /* number of characters of the result */
894 );
895
Victor Stinner9f789e72011-10-01 03:57:28 +0200896#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200898#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899
Guido van Rossumd8225182000-03-10 22:33:05 +0000900#endif
901
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000902/* --- Unicode ordinals --------------------------------------------------- */
903
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904/* Create a Unicode Object from the given Unicode code point ordinal.
905
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000906 The ordinal must be in range(0x10000) on narrow Python builds
907 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
908 raised in case it is not.
909
910*/
911
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000912PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000913
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000914/* --- Free-list management ----------------------------------------------- */
915
916/* Clear the free list used by the Unicode implementation.
917
918 This can be used to release memory used for objects on the free
919 list back to the Python memory allocator.
920
921*/
922
923PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
924
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000925/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000926
927 Many of these APIs take two arguments encoding and errors. These
928 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000929 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000930
Georg Brandl952867a2010-06-27 10:17:12 +0000931 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000932
933 Error handling is set by errors which may also be set to NULL
934 meaning to use the default handling defined for the codec. Default
935 error handling for all builtin codecs is "strict" (ValueErrors are
936 raised).
937
938 The codecs all use a similar interface. Only deviations from the
939 generic ones are documented.
940
941*/
942
Fred Drakecb093fe2000-05-09 19:51:53 +0000943/* --- Manage the default encoding ---------------------------------------- */
944
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000945/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000946 Unicode object unicode and the size of the encoded representation
947 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000948
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000949 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000950
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200951 This function caches the UTF-8 encoded string in the unicodeobject
952 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953 when the unicodeobject is deallocated.
954
955 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
956 support the previous internal function with the same behaviour.
957
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000958 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000959 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000960
961 *** If you need to access the Unicode object as UTF-8 bytes string,
962 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000963*/
964
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000965#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000967 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000968 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000970#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000971
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000972/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000973 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
976 in the unicodeobject.
977
978 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
979 support the previous internal function with the same behaviour.
980
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000981 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000982 extracted from the returned data.
983
984 *** This API is for interpreter INTERNAL USE ONLY and will likely
985 *** be removed or changed in the future.
986
987 *** If you need to access the Unicode object as UTF-8 bytes string,
988 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000989
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000990*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000991
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000992#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
994#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000995#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000996
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000997/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000998
Mark Hammond91a681d2002-08-12 07:21:58 +0000999PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001000
Guido van Rossumd8225182000-03-10 22:33:05 +00001001/* --- Generic Codecs ----------------------------------------------------- */
1002
1003/* Create a Unicode object by decoding the encoded string s of the
1004 given size. */
1005
Mark Hammond91a681d2002-08-12 07:21:58 +00001006PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001007 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001009 const char *encoding, /* encoding */
1010 const char *errors /* error handling */
1011 );
1012
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001013/* Decode a Unicode object unicode and return the result as Python
1014 object. */
1015
1016PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017 PyObject *unicode, /* Unicode object */
1018 const char *encoding, /* encoding */
1019 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001020 );
1021
1022/* Decode a Unicode object unicode and return the result as Unicode
1023 object. */
1024
1025PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001026 PyObject *unicode, /* Unicode object */
1027 const char *encoding, /* encoding */
1028 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001029 );
1030
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001031/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001032 Python string object. */
1033
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001034#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001035PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001036 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001038 const char *encoding, /* encoding */
1039 const char *errors /* error handling */
1040 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001041#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001042
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001043/* Encodes a Unicode object and returns the result as Python
1044 object. */
1045
1046PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001047 PyObject *unicode, /* Unicode object */
1048 const char *encoding, /* encoding */
1049 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001050 );
1051
Guido van Rossumd8225182000-03-10 22:33:05 +00001052/* Encodes a Unicode object and returns the result as Python string
1053 object. */
1054
Mark Hammond91a681d2002-08-12 07:21:58 +00001055PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 PyObject *unicode, /* Unicode object */
1057 const char *encoding, /* encoding */
1058 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001059 );
1060
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001061/* Encodes a Unicode object and returns the result as Unicode
1062 object. */
1063
1064PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 PyObject *unicode, /* Unicode object */
1066 const char *encoding, /* encoding */
1067 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001068 );
1069
1070/* Build an encoding map. */
1071
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001072PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1073 PyObject* string /* 256 character map */
1074 );
1075
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076/* --- UTF-7 Codecs ------------------------------------------------------- */
1077
Mark Hammond91a681d2002-08-12 07:21:58 +00001078PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001079 const char *string, /* UTF-7 encoded string */
1080 Py_ssize_t length, /* size of string */
1081 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001082 );
1083
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001084PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 const char *string, /* UTF-7 encoded string */
1086 Py_ssize_t length, /* size of string */
1087 const char *errors, /* error handling */
1088 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001089 );
1090
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001091#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001092PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001093 const Py_UNICODE *data, /* Unicode char buffer */
1094 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1095 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1096 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1097 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001098 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001099PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1100 PyObject *unicode, /* Unicode object */
1101 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1102 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1103 const char *errors /* error handling */
1104 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001105#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001106
Guido van Rossumd8225182000-03-10 22:33:05 +00001107/* --- UTF-8 Codecs ------------------------------------------------------- */
1108
Mark Hammond91a681d2002-08-12 07:21:58 +00001109PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 const char *string, /* UTF-8 encoded string */
1111 Py_ssize_t length, /* size of string */
1112 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001113 );
1114
Walter Dörwald69652032004-09-07 20:24:22 +00001115PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116 const char *string, /* UTF-8 encoded string */
1117 Py_ssize_t length, /* size of string */
1118 const char *errors, /* error handling */
1119 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001120 );
1121
Mark Hammond91a681d2002-08-12 07:21:58 +00001122PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001124 );
1125
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001126#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1128 PyObject *unicode,
1129 const char *errors);
1130
Mark Hammond91a681d2002-08-12 07:21:58 +00001131PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001132 const Py_UNICODE *data, /* Unicode char buffer */
1133 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1134 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001135 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001136#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001137
Walter Dörwald41980ca2007-08-16 21:55:45 +00001138/* --- UTF-32 Codecs ------------------------------------------------------ */
1139
1140/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1141 the corresponding Unicode object.
1142
1143 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001145
1146 If byteorder is non-NULL, the decoder starts decoding using the
1147 given byte order:
1148
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 *byteorder == -1: little endian
1150 *byteorder == 0: native order
1151 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001152
1153 In native mode, the first four bytes of the stream are checked for a
1154 BOM mark. If found, the BOM mark is analysed, the byte order
1155 adjusted and the BOM skipped. In the other modes, no BOM mark
1156 interpretation is done. After completion, *byteorder is set to the
1157 current byte order at the end of input data.
1158
1159 If byteorder is NULL, the codec starts in native order mode.
1160
1161*/
1162
1163PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 const char *string, /* UTF-32 encoded string */
1165 Py_ssize_t length, /* size of string */
1166 const char *errors, /* error handling */
1167 int *byteorder /* pointer to byteorder to use
1168 0=native;-1=LE,1=BE; updated on
1169 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001170 );
1171
1172PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 const char *string, /* UTF-32 encoded string */
1174 Py_ssize_t length, /* size of string */
1175 const char *errors, /* error handling */
1176 int *byteorder, /* pointer to byteorder to use
1177 0=native;-1=LE,1=BE; updated on
1178 exit */
1179 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001180 );
1181
1182/* Returns a Python string using the UTF-32 encoding in native byte
1183 order. The string always starts with a BOM mark. */
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001187 );
1188
1189/* Returns a Python string object holding the UTF-32 encoded value of
1190 the Unicode data.
1191
1192 If byteorder is not 0, output is written according to the following
1193 byte order:
1194
1195 byteorder == -1: little endian
1196 byteorder == 0: native byte order (writes a BOM mark)
1197 byteorder == 1: big endian
1198
1199 If byteorder is 0, the output string will always start with the
1200 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1201 prepended.
1202
1203*/
1204
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001205#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001206PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 const Py_UNICODE *data, /* Unicode char buffer */
1208 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1209 const char *errors, /* error handling */
1210 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001211 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001212PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1213 PyObject *object, /* Unicode object */
1214 const char *errors, /* error handling */
1215 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1216 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001217#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001218
Guido van Rossumd8225182000-03-10 22:33:05 +00001219/* --- UTF-16 Codecs ------------------------------------------------------ */
1220
Guido van Rossum9e896b32000-04-05 20:11:21 +00001221/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001222 the corresponding Unicode object.
1223
1224 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001226
1227 If byteorder is non-NULL, the decoder starts decoding using the
1228 given byte order:
1229
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 *byteorder == -1: little endian
1231 *byteorder == 0: native order
1232 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001233
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001234 In native mode, the first two bytes of the stream are checked for a
1235 BOM mark. If found, the BOM mark is analysed, the byte order
1236 adjusted and the BOM skipped. In the other modes, no BOM mark
1237 interpretation is done. After completion, *byteorder is set to the
1238 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001239
1240 If byteorder is NULL, the codec starts in native order mode.
1241
1242*/
1243
Mark Hammond91a681d2002-08-12 07:21:58 +00001244PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 const char *string, /* UTF-16 encoded string */
1246 Py_ssize_t length, /* size of string */
1247 const char *errors, /* error handling */
1248 int *byteorder /* pointer to byteorder to use
1249 0=native;-1=LE,1=BE; updated on
1250 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001251 );
1252
Walter Dörwald69652032004-09-07 20:24:22 +00001253PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 const char *string, /* UTF-16 encoded string */
1255 Py_ssize_t length, /* size of string */
1256 const char *errors, /* error handling */
1257 int *byteorder, /* pointer to byteorder to use
1258 0=native;-1=LE,1=BE; updated on
1259 exit */
1260 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001261 );
1262
Guido van Rossumd8225182000-03-10 22:33:05 +00001263/* Returns a Python string using the UTF-16 encoding in native byte
1264 order. The string always starts with a BOM mark. */
1265
Mark Hammond91a681d2002-08-12 07:21:58 +00001266PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001268 );
1269
1270/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001271 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001272
1273 If byteorder is not 0, output is written according to the following
1274 byte order:
1275
1276 byteorder == -1: little endian
1277 byteorder == 0: native byte order (writes a BOM mark)
1278 byteorder == 1: big endian
1279
1280 If byteorder is 0, the output string will always start with the
1281 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1282 prepended.
1283
1284 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1285 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001286 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001287
1288*/
1289
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001290#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001291PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 const Py_UNICODE *data, /* Unicode char buffer */
1293 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1294 const char *errors, /* error handling */
1295 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001296 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001297PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1298 PyObject* unicode, /* Unicode object */
1299 const char *errors, /* error handling */
1300 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1301 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001302#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001303
1304/* --- Unicode-Escape Codecs ---------------------------------------------- */
1305
Mark Hammond91a681d2002-08-12 07:21:58 +00001306PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001307 const char *string, /* Unicode-Escape encoded string */
1308 Py_ssize_t length, /* size of string */
1309 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001310 );
1311
Mark Hammond91a681d2002-08-12 07:21:58 +00001312PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001313 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001314 );
1315
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001316#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001317PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 const Py_UNICODE *data, /* Unicode char buffer */
1319 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001320 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001321#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001322
1323/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1324
Mark Hammond91a681d2002-08-12 07:21:58 +00001325PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326 const char *string, /* Raw-Unicode-Escape encoded string */
1327 Py_ssize_t length, /* size of string */
1328 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001329 );
1330
Mark Hammond91a681d2002-08-12 07:21:58 +00001331PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001333 );
1334
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001335#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001336PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 const Py_UNICODE *data, /* Unicode char buffer */
1338 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001339 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001340#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001341
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001342/* --- Unicode Internal Codec ---------------------------------------------
1343
1344 Only for internal use in _codecsmodule.c */
1345
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001346#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001347PyObject *_PyUnicode_DecodeUnicodeInternal(
1348 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001349 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001350 const char *errors
1351 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001352#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001353
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001354/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001355
1356 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1357
1358*/
1359
Mark Hammond91a681d2002-08-12 07:21:58 +00001360PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001361 const char *string, /* Latin-1 encoded string */
1362 Py_ssize_t length, /* size of string */
1363 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001364 );
1365
Mark Hammond91a681d2002-08-12 07:21:58 +00001366PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001368 );
1369
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001370#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1372 PyObject* unicode,
1373 const char* errors);
1374
Mark Hammond91a681d2002-08-12 07:21:58 +00001375PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001376 const Py_UNICODE *data, /* Unicode char buffer */
1377 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1378 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001379 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001380#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001381
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001382/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001383
1384 Only 7-bit ASCII data is accepted. All other codes generate errors.
1385
1386*/
1387
Mark Hammond91a681d2002-08-12 07:21:58 +00001388PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001389 const char *string, /* ASCII encoded string */
1390 Py_ssize_t length, /* size of string */
1391 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001392 );
1393
Mark Hammond91a681d2002-08-12 07:21:58 +00001394PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001396 );
1397
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001398#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1400 PyObject* unicode,
1401 const char* errors);
1402
Mark Hammond91a681d2002-08-12 07:21:58 +00001403PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001404 const Py_UNICODE *data, /* Unicode char buffer */
1405 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1406 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001407 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001408#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001409
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001413
1414 Decoding mappings must map single string characters to single
1415 Unicode characters, integers (which are then interpreted as Unicode
1416 ordinals) or None (meaning "undefined mapping" and causing an
1417 error).
1418
1419 Encoding mappings must map single Unicode characters to single
1420 string characters, integers (which are then interpreted as Latin-1
1421 ordinals) or None (meaning "undefined mapping" and causing an
1422 error).
1423
1424 If a character lookup fails with a LookupError, the character is
1425 copied as-is meaning that its ordinal value will be interpreted as
1426 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1427 to contain those mappings which map characters to different code
1428 points.
1429
1430*/
1431
Mark Hammond91a681d2002-08-12 07:21:58 +00001432PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 const char *string, /* Encoded string */
1434 Py_ssize_t length, /* size of string */
1435 PyObject *mapping, /* character mapping
1436 (char ordinal -> unicode ordinal) */
1437 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001438 );
1439
Mark Hammond91a681d2002-08-12 07:21:58 +00001440PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 PyObject *unicode, /* Unicode object */
1442 PyObject *mapping /* character mapping
1443 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001444 );
1445
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001446#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001447PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001448 const Py_UNICODE *data, /* Unicode char buffer */
1449 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1450 PyObject *mapping, /* character mapping
1451 (unicode ordinal -> char ordinal) */
1452 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001453 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001454PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1455 PyObject *unicode, /* Unicode object */
1456 PyObject *mapping, /* character mapping
1457 (unicode ordinal -> char ordinal) */
1458 const char *errors /* error handling */
1459 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001460#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001461
1462/* Translate a Py_UNICODE buffer of the given length by applying a
1463 character mapping table to it and return the resulting Unicode
1464 object.
1465
1466 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001467 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001468
1469 Mapping tables may be dictionaries or sequences. Unmapped character
1470 ordinals (ones which cause a LookupError) are left untouched and
1471 are copied as-is.
1472
1473*/
1474
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001475#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001476PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 const Py_UNICODE *data, /* Unicode char buffer */
1478 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1479 PyObject *table, /* Translate table */
1480 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001481 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001482#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001483
Victor Stinner99b95382011-07-04 14:23:54 +02001484#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001485
Guido van Rossumefec1152000-03-28 02:01:15 +00001486/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001487
Mark Hammond91a681d2002-08-12 07:21:58 +00001488PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001489 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001490 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001491 const char *errors /* error handling */
1492 );
1493
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001494PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1495 const char *string, /* MBCS encoded string */
1496 Py_ssize_t length, /* size of string */
1497 const char *errors, /* error handling */
1498 Py_ssize_t *consumed /* bytes consumed */
1499 );
1500
Victor Stinner3a50e702011-10-18 21:21:00 +02001501PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1502 int code_page, /* code page number */
1503 const char *string, /* encoded string */
1504 Py_ssize_t length, /* size of string */
1505 const char *errors, /* error handling */
1506 Py_ssize_t *consumed /* bytes consumed */
1507 );
1508
Mark Hammond91a681d2002-08-12 07:21:58 +00001509PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001510 PyObject *unicode /* Unicode object */
1511 );
1512
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001513#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001514PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001515 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001516 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001517 const char *errors /* error handling */
1518 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001519#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001520
Victor Stinner3a50e702011-10-18 21:21:00 +02001521PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1522 int code_page, /* code page number */
1523 PyObject *unicode, /* Unicode object */
1524 const char *errors /* error handling */
1525 );
1526
Victor Stinner99b95382011-07-04 14:23:54 +02001527#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001528
Guido van Rossum9e896b32000-04-05 20:11:21 +00001529/* --- Decimal Encoder ---------------------------------------------------- */
1530
1531/* Takes a Unicode string holding a decimal value and writes it into
1532 an output buffer using standard ASCII digit codes.
1533
1534 The output buffer has to provide at least length+1 bytes of storage
1535 area. The output string is 0-terminated.
1536
1537 The encoder converts whitespace to ' ', decimal characters to their
1538 corresponding ASCII digit and all other Latin-1 characters except
1539 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1540 are treated as errors. This includes embedded NUL characters.
1541
1542 Error handling is defined by the errors argument:
1543
1544 NULL or "strict": raise a ValueError
1545 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001546 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001547 "replace": replaces illegal characters with '?'
1548
1549 Returns 0 on success, -1 on failure.
1550
1551*/
1552
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001553#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001554PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 Py_UNICODE *s, /* Unicode buffer */
1556 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1557 char *output, /* Output buffer; must have size >= length+1 (0-terminated) */
1558 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001559 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001560#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001561
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001562/* Transforms code points that have decimal digit property to the
1563 corresponding ASCII digit code points.
1564
1565 Returns a new Unicode string on success, NULL on failure.
1566*/
1567
Georg Brandlb5503082010-12-05 11:40:48 +00001568#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001569PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1570 Py_UNICODE *s, /* Unicode buffer */
1571 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1572 );
Georg Brandlb5503082010-12-05 11:40:48 +00001573#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001574
Victor Stinner6f9568b2011-11-17 00:12:44 +01001575/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 as argument instead of a raw buffer and length. This function additionally
1577 transforms spaces to ASCII because this is what the callers in longobject,
1578 floatobject, and complexobject did anyway. */
1579
1580#ifndef Py_LIMITED_API
1581PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1582 PyObject *unicode /* Unicode object */
1583 );
1584#endif
1585
Martin v. Löwis011e8422009-05-05 04:43:17 +00001586/* --- File system encoding ---------------------------------------------- */
1587
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001588/* ParseTuple converter: encode str objects to bytes using
1589 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001590
1591PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1592
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001593/* ParseTuple converter: decode bytes objects to unicode using
1594 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1595
1596PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1597
Victor Stinner77c38622010-05-14 15:58:55 +00001598/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1599 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001600
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001601 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1602 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001603
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001604 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001605*/
1606
1607PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1608 const char *s /* encoded string */
1609 );
1610
Victor Stinner77c38622010-05-14 15:58:55 +00001611/* Decode a string using Py_FileSystemDefaultEncoding
1612 and the "surrogateescape" error handler.
1613
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001614 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1615 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001616*/
1617
Martin v. Löwis011e8422009-05-05 04:43:17 +00001618PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1619 const char *s, /* encoded string */
1620 Py_ssize_t size /* size */
1621 );
1622
Victor Stinnerae6265f2010-05-15 16:27:27 +00001623/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001624 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001625
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001626 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1627 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001628*/
1629
1630PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1631 PyObject *unicode
1632 );
1633
Guido van Rossumd8225182000-03-10 22:33:05 +00001634/* --- Methods & Slots ----------------------------------------------------
1635
1636 These are capable of handling Unicode objects and strings on input
1637 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001638 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001639
1640/* Concat two strings giving a new Unicode string. */
1641
Mark Hammond91a681d2002-08-12 07:21:58 +00001642PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001643 PyObject *left, /* Left string */
1644 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001645 );
1646
Walter Dörwald1ab83302007-05-18 17:15:44 +00001647/* Concat two strings and put the result in *pleft
1648 (sets *pleft to NULL on error) */
1649
1650PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 PyObject **pleft, /* Pointer to left string */
1652 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001653 );
1654
1655/* Concat two strings, put the result in *pleft and drop the right object
1656 (sets *pleft to NULL on error) */
1657
1658PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001659 PyObject **pleft, /* Pointer to left string */
1660 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001661 );
1662
Guido van Rossumd8225182000-03-10 22:33:05 +00001663/* Split a string giving a list of Unicode strings.
1664
1665 If sep is NULL, splitting will be done at all whitespace
1666 substrings. Otherwise, splits occur at the given separator.
1667
1668 At most maxsplit splits will be done. If negative, no limit is set.
1669
1670 Separators are not included in the resulting list.
1671
1672*/
1673
Mark Hammond91a681d2002-08-12 07:21:58 +00001674PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001675 PyObject *s, /* String to split */
1676 PyObject *sep, /* String separator */
1677 Py_ssize_t maxsplit /* Maxsplit count */
1678 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001679
1680/* Split a string at line breaks, giving a list of Unicode strings.
1681
1682 CRLF is considered to be one line break. Line breaks are not
1683 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001684
Mark Hammond91a681d2002-08-12 07:21:58 +00001685PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001686 PyObject *s, /* String to split */
1687 int keepends /* If true, line end markers are included */
1688 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001689
Thomas Wouters477c8d52006-05-27 19:21:47 +00001690/* Partition a string using a given separator. */
1691
1692PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001693 PyObject *s, /* String to partition */
1694 PyObject *sep /* String separator */
1695 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001696
1697/* Partition a string using a given separator, searching from the end of the
1698 string. */
1699
1700PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001701 PyObject *s, /* String to partition */
1702 PyObject *sep /* String separator */
1703 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001704
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001705/* Split a string giving a list of Unicode strings.
1706
1707 If sep is NULL, splitting will be done at all whitespace
1708 substrings. Otherwise, splits occur at the given separator.
1709
1710 At most maxsplit splits will be done; if negative, no limit is
1711 set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
1712 of the string.
1713
1714 Separators are not included in the resulting list.
1715
1716*/
1717
1718PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 PyObject *s, /* String to split */
1720 PyObject *sep, /* String separator */
1721 Py_ssize_t maxsplit /* Maxsplit count */
1722 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001723
Guido van Rossumd8225182000-03-10 22:33:05 +00001724/* Translate a string by applying a character mapping table to it and
1725 return the resulting Unicode object.
1726
1727 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001729
1730 Mapping tables may be dictionaries or sequences. Unmapped character
1731 ordinals (ones which cause a LookupError) are left untouched and
1732 are copied as-is.
1733
1734*/
1735
Mark Hammond91a681d2002-08-12 07:21:58 +00001736PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001737 PyObject *str, /* String */
1738 PyObject *table, /* Translate table */
1739 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001740 );
1741
1742/* Join a sequence of strings using the given separator and return
1743 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001744
Mark Hammond91a681d2002-08-12 07:21:58 +00001745PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001746 PyObject *separator, /* Separator string */
1747 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001748 );
1749
1750/* Return 1 if substr matches str[start:end] at the given tail end, 0
1751 otherwise. */
1752
Martin v. Löwis18e16552006-02-15 17:27:45 +00001753PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001754 PyObject *str, /* String */
1755 PyObject *substr, /* Prefix or Suffix string */
1756 Py_ssize_t start, /* Start index */
1757 Py_ssize_t end, /* Stop index */
1758 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001759 );
1760
1761/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001762 given search direction or -1 if not found. -2 is returned in case
1763 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001764
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001766 PyObject *str, /* String */
1767 PyObject *substr, /* Substring to find */
1768 Py_ssize_t start, /* Start index */
1769 Py_ssize_t end, /* Stop index */
1770 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001771 );
1772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773/* Like PyUnicode_Find, but search for single character only. */
1774PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1775 PyObject *str, /* String */
1776 Py_UCS4 ch, /* Character to find */
1777 Py_ssize_t start, /* Start index */
1778 Py_ssize_t end, /* Stop index */
1779 int direction /* Find direction: +1 forward, -1 backward */
1780 );
1781
Barry Warsaw51ac5802000-03-20 16:36:48 +00001782/* Count the number of occurrences of substr in str[start:end].
   NOTE(review): whether overlapping occurrences are counted and which
   value signals an error (presumably -1 with an exception set) are not
   visible from this header -- confirm against the implementation. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001783
Martin v. Löwis18e16552006-02-15 17:27:45 +00001784PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001785 PyObject *str, /* String */
1786 PyObject *substr, /* Substring to count */
1787 Py_ssize_t start, /* Start index */
1788 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001789 );
1790
Barry Warsaw51ac5802000-03-20 16:36:48 +00001791/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001792 and return the resulting Unicode object.
   NOTE(review): presumably returns NULL with an exception set on
   failure -- confirm against the implementation. */
1793
Mark Hammond91a681d2002-08-12 07:21:58 +00001794PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001795 PyObject *str, /* String */
1796 PyObject *substr, /* Substring to find */
1797 PyObject *replstr, /* Replacement for each occurrence found */
1798 Py_ssize_t maxcount /* Max. number of replacements to apply;
1799 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001800 );
1801
1802/* Compare two strings and return -1, 0, 1 for less than, equal,
1803 greater than respectively.
   NOTE(review): -1 is a regular result here, so an error cannot be told
   apart by the return value alone; presumably callers must check
   PyErr_Occurred() -- confirm against the implementation. */
1804
Mark Hammond91a681d2002-08-12 07:21:58 +00001805PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001806 PyObject *left, /* Left string */
1807 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001808 );
1809
/* Compare the object left with the ASCII-encoded C string right.
   NOTE(review): presumably returns -1, 0, 1 like PyUnicode_Compare;
   error behavior is not visible from this header -- confirm against
   the implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001810PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1811 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001812 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001813 );
1814
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001815/* Rich compare two strings and return one of the following:
1816
1817 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001818 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001819 - Py_NotImplemented in case the type combination is unknown
1820
1821 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1822 case the conversion of the arguments to Unicode fails with a
1823 UnicodeDecodeError.
1824
1825 Possible values for op:
1826
1827 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1828
1829*/
1830
1831PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001832 PyObject *left, /* Left string */
1833 PyObject *right, /* Right string */
1834 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001835 );
1836
Thomas Wouters7e474022000-07-16 12:04:32 +00001837/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001838 the resulting Unicode string.
   NOTE(review): presumably returns NULL with an exception set on
   failure -- confirm against the implementation. */
1839
Mark Hammond91a681d2002-08-12 07:21:58 +00001840PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001841 PyObject *format, /* Format string */
1842 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001843 );
1844
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001845/* Check whether element is contained in container and return 1/0
1846 accordingly.
1847
1848 element has to coerce to a one-element Unicode string. -1 is
1849 returned in case of an error. */
1850
Mark Hammond91a681d2002-08-12 07:21:58 +00001851PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001852 PyObject *container, /* Container string */
1853 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001854 );
1855
Martin v. Löwis47383402007-08-15 07:32:56 +00001856/* Check whether the argument s is a valid identifier.
   NOTE(review): presumably returns 1 if it is and 0 otherwise; error
   behavior is not visible from this header -- confirm against the
   implementation. */
1857
1858PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1859
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001860#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001861/* Externally visible for str.strip(unicode). Declared only when
   Py_LIMITED_API is not defined. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001862PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001863 PyObject *self, /* presumably the string to strip -- TODO confirm */
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001864 int striptype, /* presumably selects which end(s) to strip; valid values are defined by the implementation -- TODO confirm */
1865 PyObject *sepobj /* presumably the characters to strip -- TODO confirm */
1866 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001867#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001868
Eric Smith5807c412008-05-11 21:00:57 +00001869/* Using the current locale, insert the thousands grouping
1870 into the string pointed to by buffer. For the argument descriptions,
1871 see Objects/stringlib/localeutil.h.

   This variant takes Py_UNICODE buffers and is declared only when
   Py_LIMITED_API is not defined. */
1872
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001873#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001874PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1875 Py_ssize_t n_buffer,
1876 Py_UNICODE *digits,
1877 Py_ssize_t n_digits,
1878 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001879#endif
Eric Smith5807c412008-05-11 21:00:57 +00001880
Eric Smitha3b1ac82009-04-03 14:45:06 +00001881/* Using explicitly passed-in values, insert the thousands grouping
1882 into the string pointed to by buffer. For the argument descriptions,
1883 see Objects/stringlib/localeutil.h.

   Unlike the Locale variant, the grouping and thousands_sep strings are
   supplied by the caller, and buffer/digits are untyped (void *)
   pointers accompanied by a kind argument.
   NOTE(review): kind presumably identifies the character width of
   buffer and digits -- confirm against the implementation. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001884#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001886 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 int kind,
1888 void *buffer,
1889 Py_ssize_t n_buffer,
1890 void *digits,
1891 Py_ssize_t n_digits,
1892 Py_ssize_t min_width,
1893 const char *grouping,
1894 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001895#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001896/* === Characters Type APIs =============================================== */
1897
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001898/* Helper array used by Py_UNICODE_ISSPACE().
   NOTE(review): presumably indexed by ASCII code with a nonzero entry
   for whitespace characters; the array size is not given in this
   declaration -- confirm against the definition. */
1899
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001900#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001901PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1902
Guido van Rossumd8225182000-03-10 22:33:05 +00001903/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001904 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001905
1906 These APIs are implemented in Objects/unicodectype.c.
1907
   Each _PyUnicode_Is* function takes a single code point (Py_UCS4) and
   returns an int.
   NOTE(review): presumably a nonzero result means the code point has
   the property named by the function -- confirm in
   Objects/unicodectype.c.

1908*/
1909
Mark Hammond91a681d2002-08-12 07:21:58 +00001910PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001911 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001912 );
1913
Mark Hammond91a681d2002-08-12 07:21:58 +00001914PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001915 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001916 );
1917
Mark Hammond91a681d2002-08-12 07:21:58 +00001918PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001919 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001920 );
1921
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001922PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001923 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001924 );
1925
1926PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001927 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001928 );
1929
Mark Hammond91a681d2002-08-12 07:21:58 +00001930PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001931 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001932 );
1933
Mark Hammond91a681d2002-08-12 07:21:58 +00001934PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001935 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001936 );
1937
/* Case conversions: each function takes one code point and returns a
   Py_UCS4.
   NOTE(review): presumably the result is the lower-, upper- or
   titlecase counterpart of ch; what is returned when no mapping exists
   is not visible from this header -- confirm in the implementation. */
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001938PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1939 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001942PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1943 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001944 );
1945
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001946PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1947 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001948 );
1949
/* Numeric value lookups for a single code point: the decimal-digit and
   digit variants return an int, the numeric variant returns a double.
   NOTE(review): the value returned for a character that lacks the
   corresponding property is not visible from this header -- confirm in
   the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001950PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001951 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001952 );
1953
Mark Hammond91a681d2002-08-12 07:21:58 +00001954PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001955 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001956 );
1957
Mark Hammond91a681d2002-08-12 07:21:58 +00001958PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001959 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001960 );
1961
/* Further character-class tests; each takes one code point and returns
   an int.
   NOTE(review): presumably a nonzero result means the code point has
   the property named by the function -- confirm in the
   implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001962PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001963 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001964 );
1965
Mark Hammond91a681d2002-08-12 07:21:58 +00001966PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001967 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001968 );
1969
Mark Hammond91a681d2002-08-12 07:21:58 +00001970PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001971 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001972 );
1973
Georg Brandl559e5d72008-06-11 18:37:52 +00001974PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001975 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001976 );
1977
Mark Hammond91a681d2002-08-12 07:21:58 +00001978PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001979 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001980 );
1981
/* Length and copy helpers for Py_UNICODE arrays.
   NOTE(review): the names and signatures mirror the C library's
   strlen, strcpy, strcat and strncpy; presumably they follow the same
   contracts on NUL-terminated Py_UNICODE arrays (including the caller's
   duty to size the destination) -- confirm against the
   implementation. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00001982PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1983 const Py_UNICODE *u
1984 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001985
1986PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001987 Py_UNICODE *s1,
1988 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001989
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001990PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1991 Py_UNICODE *s1, const Py_UNICODE *s2);
1992
Martin v. Löwis5b222132007-06-10 09:51:05 +00001993PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001994 Py_UNICODE *s1,
1995 const Py_UNICODE *s2,
1996 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001997
/* Comparison helpers for Py_UNICODE arrays; neither argument is
   modified (both are const).
   NOTE(review): presumably these follow the C library's strcmp and
   strncmp contracts, with n limiting the number of elements compared --
   confirm against the implementation. */
1998PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001999 const Py_UNICODE *s1,
2000 const Py_UNICODE *s2
2001 );
2002
2003PyAPI_FUNC(int) Py_UNICODE_strncmp(
2004 const Py_UNICODE *s1,
2005 const Py_UNICODE *s2,
2006 size_t n
2007 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002008
/* Character search helpers for Py_UNICODE arrays. Note that the input
   is const but the returned pointer is not.
   NOTE(review): presumably these follow the C library's strchr (first
   occurrence) and strrchr (last occurrence) contracts, returning NULL
   when c is absent -- confirm against the implementation. */
2009PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002010 const Py_UNICODE *s,
2011 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002012 );
2013
Victor Stinner331ea922010-08-10 16:37:20 +00002014PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002015 const Py_UNICODE *s,
2016 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002017 );
2018
Victor Stinner71133ff2010-09-01 23:43:53 +00002019/* Create a copy of a unicode string ending with a nul character. Return NULL
2020 and raise a MemoryError exception on memory allocation failure, otherwise
2021 return a newly allocated buffer (use PyMem_Free() to free the buffer).
   The caller owns the returned buffer. */
2022
Victor Stinner46408602010-09-03 16:18:00 +00002023PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002024 PyObject *unicode /* Unicode string to copy */
2025 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002026#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002027
/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not.
   NOTE(review): judging by the name, this validates the internal state
   of op; the meaning of check_content and of the int result are not
   visible from this header -- confirm against the implementation. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002028#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002029PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002030 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002031 int check_content);
2032#endif
2033
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002034/********************* String Literals ****************************************/
2035/* This structure helps manage static strings. The basic usage goes like this:
2036 Instead of doing
2037
2038 r = PyObject_CallMethod(o, "foo", "args", ...);
2039
2040 do
2041
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002042 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002043 ...
2044 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2045
2046 PyId_foo is a static variable, either on block level or file level. On first
2047 usage, the string "foo" is interned, and the structures are linked. On interpreter
2048 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2049
2050 Alternatively, _Py_static_string allows choosing the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002051 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002052 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2053*/
2054typedef struct _Py_Identifier {
2055 struct _Py_Identifier *next; /* link to another identifier; set when the structures are linked on first usage */
2056 const char* string; /* C string to be interned */
2057 PyObject *object; /* presumably the interned string object once created -- TODO confirm */
2058} _Py_Identifier;
2059
/* _Py_static_string(varname, value) defines a static _Py_Identifier
   named varname whose string member is value; the next and object
   members are initialized to 0.
   _Py_IDENTIFIER(varname) does the same with the variable named
   PyId_<varname> and the stringified varname as its string. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002060#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002061#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002062
2063/* Return an interned Unicode object for an Identifier; may fail if there is
   no memory. The usage notes in this header describe the result as a
   borrowed reference.
   NOTE(review): the failure value is presumably NULL with an exception
   set -- confirm against the implementation. */
2064PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2065/* Clear all static strings. Per the usage notes in this header, this is
   how the interned _Py_Identifier strings are released on interpreter
   shutdown. */
2066PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2067
Guido van Rossumd8225182000-03-10 22:33:05 +00002068#ifdef __cplusplus
2069}
2070#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002071#endif /* !Py_UNICODEOBJECT_H */