blob: 6a31e48836f8181e9811734696c82d839da94c24 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
/* Set these flags if the platform has "wchar.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */

#ifndef Py_LIMITED_API
/* Both names are hidden from the limited API. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

/* HAVE_MBCS is defined only when MS_WINDOWS is defined. */
#if defined(MS_WINDOWS)
#  define HAVE_MBCS
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
#  include <wchar.h>
#endif
117
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119 unicode representations. */
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100120#if SIZEOF_INT == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121typedef unsigned int Py_UCS4;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100122#elif SIZEOF_LONG == 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100128#if SIZEOF_SHORT == 2
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129typedef unsigned short Py_UCS2;
Victor Stinner7c8bbbb2011-11-20 18:28:29 +0100130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200134typedef unsigned char Py_UCS1;
135
Guido van Rossumd8225182000-03-10 22:33:05 +0000136/* --- Internal Unicode Operations ---------------------------------------- */
137
/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000144#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are looked up in the
   _Py_ascii_whitespace table; all other values are passed to
   _PyUnicode_IsWhitespace().  ch is expanded more than once, so it must
   not have side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000147
/* Case and line-break classification; each macro forwards ch to the
   _PyUnicode_Is*() function named on its right-hand side. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversion. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable classification. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversion. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.  ch is expanded
   up to four times, so it must not have side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000173
/* Copy length Py_UNICODE code units from source to target using
   Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store value into the first length code units of target.
   target, value and length are each evaluated exactly once (in that order),
   so arguments with side effects are safe.  The trailing-underscore locals
   live only inside the do/while scope. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000181
/* macros to work with surrogates */
/* Range tests on a code unit.  ch is parenthesized so that an operator
   expression passed as the argument is compared as a whole; it is still
   expanded twice, so it must not have side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
190
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty. */

/* The first and the last code unit are compared before falling back to
   memcmp() over the whole substring.
   NOTE(review): the macro reads ->wstr and ->wstr_length directly from its
   arguments; in the structures declared in this header wstr is a member of
   PyASCIIObject and wstr_length a member of PyCompactUnicodeObject, so it is
   unclear which pointer type callers are expected to pass -- confirm. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
198
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000199#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000200
Barry Warsaw51ac5802000-03-20 16:36:48 +0000201#ifdef __cplusplus
202extern "C" {
203#endif
204
Guido van Rossumd8225182000-03-10 22:33:05 +0000205/* --- Unicode Type ------------------------------------------------------- */
206
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000207#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200208
209/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
210 structure. state.ascii and state.compact are set, and the data
211 immediately follow the structure. utf8_length and wstr_length can be found
212 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000213typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200214 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200215
216 - compact ascii:
217
218 * structure = PyASCIIObject
219 * kind = PyUnicode_1BYTE_KIND
220 * compact = 1
221 * ascii = 1
222 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200223 * (length is the length of the utf8 and wstr strings)
224 * (data starts just after the structure)
225 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200226
227 - compact:
228
229 * structure = PyCompactUnicodeObject
230 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
231 PyUnicode_4BYTE_KIND
232 * compact = 1
233 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200234 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200235 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200236 * utf8_length = 0 if utf8 is NULL
237 * wstr is shared with data and wstr_length=length
238 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100239 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200240 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200242
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200243 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200244
245 * structure = PyUnicodeObject
Victor Stinnere30c0a12011-11-04 20:54:05 +0100246 * length = 0 (use wstr_length)
247 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200248 * kind = PyUnicode_WCHAR_KIND
249 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200250 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200251 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100252 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200253 * wstr is not NULL
254 * data.any is NULL
255 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200256 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200257
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200258 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200259
260 * structure = PyUnicodeObject structure
261 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
262 PyUnicode_4BYTE_KIND
263 * compact = 0
264 * ready = 1
265 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200266 * utf8 is shared and utf8_length = length with data.any if ascii = 1
267 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100268 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200269 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
270 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
271 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200272
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200273 Compact strings use only one memory block (structure + characters),
274 whereas legacy strings use one block for the structure and one block
275 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200276
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200277 Legacy strings are created by PyUnicode_FromUnicode() and
278 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
279 when PyUnicode_READY() is called.
280
281 See also _PyUnicode_CheckConsistency().
282 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000283 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200284 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000285 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200286 struct {
287 /*
288 SSTATE_NOT_INTERNED (0)
289 SSTATE_INTERNED_MORTAL (1)
290 SSTATE_INTERNED_IMMORTAL (2)
291
292 If interned != SSTATE_NOT_INTERNED, the two references from the
293 dictionary to this object are *not* counted in ob_refcnt.
294 */
295 unsigned int interned:2;
296 /* Character size:
297
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200298 - PyUnicode_WCHAR_KIND (0):
299
300 * character type = wchar_t (16 or 32 bits, depending on the
301 platform)
302
303 - PyUnicode_1BYTE_KIND (1):
304
305 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100306 * all characters are in the range U+0000-U+00FF (latin1)
307 * if ascii is set, all characters are in the range U+0000-U+007F
308 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200309 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200310
311 - PyUnicode_2BYTE_KIND (2):
312
313 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100314 * all characters are in the range U+0000-U+FFFF (BMP)
315 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200316
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200317 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200318
319 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100320 * all characters are in the range U+0000-U+10FFFF
321 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200323 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 /* Compact is with respect to the allocation scheme. Compact unicode
325 objects only require one memory block while non-compact objects use
326 one block for the PyUnicodeObject struct and another for its data
327 buffer. */
328 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100329 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200330 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
331 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200332 unsigned int ascii:1;
333 /* The ready flag indicates whether the object layout is initialized
334 completely. This means that this is either a compact object, or
335 the data pointer is filled out. The bit is redundant, and helps
336 to minimize the test in PyUnicode_IS_READY(). */
337 unsigned int ready:1;
338 } state;
339 wchar_t *wstr; /* wchar_t representation (null-terminated) */
340} PyASCIIObject;
341
342/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200343 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200344 immediately follow the structure. */
345typedef struct {
346 PyASCIIObject _base;
347 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
348 * terminating \0. */
349 char *utf8; /* UTF-8 representation (null-terminated) */
350 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
351 * surrogates count as two code points. */
352} PyCompactUnicodeObject;
353
354/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
355 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200356 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357typedef struct {
358 PyCompactUnicodeObject _base;
359 union {
360 void *any;
361 Py_UCS1 *latin1;
362 Py_UCS2 *ucs2;
363 Py_UCS4 *ucs4;
364 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000365} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000366#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000367
Mark Hammond91a681d2002-08-12 07:21:58 +0000368PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000369PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000370
Thomas Wouters27d517b2007-02-25 20:39:11 +0000371#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000372 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
373#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000374
375/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000376#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377
/* Length of the wstr representation.  Compact ASCII objects keep it in the
   PyASCIIObject length field; every other form reads wstr_length from
   PyCompactUnicodeObject.  op is parenthesized before each cast so that an
   expression argument is cast as a whole.  No type checks are performed. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
382
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */

/* When wstr is NULL, PyUnicode_AsUnicode() is called first and its return
   value is discarded; the length is then read with PyUnicode_WSTR_LENGTH().
   op is expanded several times. */
#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       PyUnicode_WSTR_LENGTH(op)))

/* PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
397
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

/* Yields the existing wstr pointer when it is non-NULL and the result of
   PyUnicode_AsUnicode() otherwise.  op is expanded several times. */
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* The PyUnicode_AS_UNICODE(op) pointer cast to const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
410
411
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
420
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.
   op is parenthesized before the cast so that an expression argument is
   cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op is expanded twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200436
/* Storage kinds.  The three *BYTE_KIND values equal the number of bytes per
   character of the matching Py_UCS1 / Py_UCS2 / Py_UCS4 buffer. */
enum PyUnicode_Kind {
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly.  */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
456
/* Return one of the PyUnicode_*_KIND values defined above.
   Asserts that op passes PyUnicode_Check() and PyUnicode_IS_READY(), then
   reads state.kind. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
462
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the address just past the PyASCIIObject header when the
   ascii flag is set, just past the PyCompactUnicodeObject header otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the data.any pointer, asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Chooses one of the two helpers above from the compact flag. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
477
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   Any kind other than 1BYTE or 2BYTE takes the default branch, which
   asserts that kind is PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
504
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.
   kind selects the element type used to index data: Py_UCS1 for 1BYTE,
   Py_UCS2 for 2BYTE, Py_UCS4 for anything else.  The result is converted
   to Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
516
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   unicode is expanded several times, so it must not have side effects. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
540
541
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   op is parenthesized before the cast so that an expression argument is
   cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
546
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.
   The result is 0x7f when the ascii flag is set, otherwise 0xff, 0xffff or
   0x10ffff depending on PyUnicode_KIND(op). */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7f) :                                                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200568
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000569#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000570
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000579
580/* === Public API ========================================================= */
581
582/* --- Plain Py_UNICODE --------------------------------------------------- */
583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200584/* With PEP 393, this is the recommended way to allocate a new unicode object.
585 This function will allocate the object and its buffer in a single memory
586 block. Objects created using this function are not resizable. */
587#ifndef Py_LIMITED_API
588PyAPI_FUNC(PyObject*) PyUnicode_New(
589 Py_ssize_t size, /* Number of code points in the new string */
590 Py_UCS4 maxchar /* maximum code point value in the string */
591 );
592#endif
593
Victor Stinnerd8f65102011-09-29 19:43:17 +0200594/* Initializes the canonical string representation from the deprecated
595 wstr/Py_UNICODE representation. This function is used to convert Unicode
596 objects which were created using the old API to the new flexible format
597 introduced with PEP 393.
598
599 Don't call this function directly, use the public PyUnicode_READY() macro
600 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601#ifndef Py_LIMITED_API
602PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200603 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604 );
605#endif
606
Victor Stinner034f6cf2011-09-30 02:26:44 +0200607/* Get a copy of a Unicode string. */
608PyAPI_FUNC(PyObject*) PyUnicode_Copy(
609 PyObject *unicode
610 );
611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612/* Copy characters from one unicode object into another; this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200613 character conversion when necessary and falls back to memcpy if possible.
614
Victor Stinnera0702ab2011-09-29 14:14:38 +0200615 Fail if to is too small (smaller than how_many or smaller than
616 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
617 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618
619 Return the number of characters written, or return -1 and raise an exception
620 on error.
621
622 Pseudo-code:
623
624 how_many = min(how_many, len(from) - from_start)
625 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
626 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200627
628 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200629 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200631PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200632 PyObject *to,
633 Py_ssize_t to_start,
634 PyObject *from,
635 Py_ssize_t from_start,
636 Py_ssize_t how_many
637 );
638#endif
639
Guido van Rossumd8225182000-03-10 22:33:05 +0000640/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000641 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000642
643 u may be NULL which causes the contents to be undefined. It is the
644 user's responsibility to fill in the needed data afterwards. Note
645 that modifying the Unicode object contents after construction is
646 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000647
648 The buffer is copied into the new object. */
649
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000650#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000651PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000652 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000653 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000654 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000655#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000656
Georg Brandl952867a2010-06-27 10:17:12 +0000657/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000658PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000659 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000660 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000661 );
662
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000663/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000665PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000666 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000667 );
668
Victor Stinnerb9275c12011-10-05 14:01:42 +0200669/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
670 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671#ifndef Py_LIMITED_API
672PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
673 int kind,
674 const void *buffer,
675 Py_ssize_t size);
676#endif
677
678PyAPI_FUNC(PyObject*) PyUnicode_Substring(
679 PyObject *str,
680 Py_ssize_t start,
681 Py_ssize_t end);
682
Georg Brandldb6c7f52011-10-07 11:19:11 +0200683/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 is set. Return NULL and raise an exception on error. Raise a ValueError if
685 the buffer is smaller than the string. Return buffer on success.
686
687 buflen is the length of the buffer in (Py_UCS4) characters. */
688PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
689 PyObject *unicode,
690 Py_UCS4* buffer,
691 Py_ssize_t buflen,
692 int copy_null);
693
694/* Copy the string into a UCS4 buffer. A new buffer is allocated using
695 * PyMem_Malloc; if this fails, NULL is returned with a memory error
696 exception set. */
697PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
698
Guido van Rossumd8225182000-03-10 22:33:05 +0000699/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700 Py_UNICODE buffer.
701 If the wchar_t/Py_UNICODE representation is not yet available, this
702 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000703
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000704#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000705PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000707 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000708#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710/* Return a read-only pointer to the Unicode object's internal
711 Py_UNICODE buffer and save the length at size.
712 If the wchar_t/Py_UNICODE representation is not yet available, this
713 function will calculate it. */
714
715#ifndef Py_LIMITED_API
716PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
717 PyObject *unicode, /* Unicode object */
718 Py_ssize_t *size /* location where to save the length */
719 );
720#endif
721
Guido van Rossumd8225182000-03-10 22:33:05 +0000722/* Get the length of the Unicode object. */
723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
725 PyObject *unicode
726);
727
Victor Stinner157f83f2011-09-28 21:41:31 +0200728/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 string representation. */
730
Martin v. Löwis18e16552006-02-15 17:27:45 +0000731PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000733 );
734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735/* Read a character from the string. */
736
737PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
738 PyObject *unicode,
739 Py_ssize_t index
740 );
741
742/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200743 PyUnicode_New, must not be shared, and must not have been hashed yet.
744
745 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746
747PyAPI_FUNC(int) PyUnicode_WriteChar(
748 PyObject *unicode,
749 Py_ssize_t index,
750 Py_UCS4 character
751 );
752
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000753#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000754/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000755PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000756#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000757
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200758/* Resize a Unicode object allocated by the legacy API (e.g.
759 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
760 PyUnicode_New) cannot be resized by this function.
761
Victor Stinner93439992011-11-20 18:29:14 +0100762 The length is a number of characters (and not the number of Py_UNICODE characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000763
764 *unicode is modified to point to the new (resized) object and 0
765 returned on success.
766
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200767 If the refcount on the object is 1, the function resizes the string in
768 place, which is usually faster than allocating a new string (and copying
769 characters).
Guido van Rossum52c23592000-04-10 13:41:41 +0000770
771 Error handling is implemented as follows: an exception is set, -1
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200772 is returned and *unicode left untouched. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000773
Mark Hammond91a681d2002-08-12 07:21:58 +0000774PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 PyObject **unicode, /* Pointer to the Unicode object */
776 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000777 );
778
Guido van Rossumd8225182000-03-10 22:33:05 +0000779/* Coerce obj to a Unicode object and return a reference with
780 *incremented* refcount.
781
782 Coercion is done in the following way:
783
Georg Brandl952867a2010-06-27 10:17:12 +0000784 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000785 under the assumptions that they contain data using the UTF-8
786 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000787
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000788 2. All other objects (including Unicode objects) raise an
789 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000790
791 The API returns NULL in case of an error. The caller is responsible
792 for decref'ing the returned objects.
793
794*/
795
Mark Hammond91a681d2002-08-12 07:21:58 +0000796PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000798 const char *encoding, /* encoding */
799 const char *errors /* error handling */
800 );
801
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000802/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000803 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000805 Unicode objects are passed back as-is (subclasses are converted to
806 true Unicode objects), all other objects are delegated to
807 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000808 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000809
810 The API returns NULL in case of an error. The caller is responsible
811 for decref'ing the returned objects.
812
813*/
814
Mark Hammond91a681d2002-08-12 07:21:58 +0000815PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000817 );
818
Victor Stinner1205f272010-09-11 00:54:47 +0000819PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
820 const char *format, /* ASCII-encoded string */
821 va_list vargs
822 );
823PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
824 const char *format, /* ASCII-encoded string */
825 ...
826 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000827
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000828#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000829/* Format the object based on the format_spec, as defined in PEP 3101
830 (Advanced String Formatting). */
831PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832 PyObject *format_spec,
833 Py_ssize_t start,
834 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000835#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000836
Walter Dörwald16807132007-05-25 13:52:07 +0000837PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
838PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000839PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
840 const char *u /* UTF-8 encoded string */
841 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000842#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000843PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000844#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000845
846/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847#define PyUnicode_CHECK_INTERNED(op) \
848 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000849
Guido van Rossumd8225182000-03-10 22:33:05 +0000850/* --- wchar_t support for platforms which support it --------------------- */
851
852#ifdef HAVE_WCHAR_H
853
Georg Brandl952867a2010-06-27 10:17:12 +0000854/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000855 size.
856
857 The buffer is copied into the new object. */
858
Mark Hammond91a681d2002-08-12 07:21:58 +0000859PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000860 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000861 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000862 );
863
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000864/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000865 most size wchar_t characters are copied.
866
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000867 Note that the resulting wchar_t string may or may not be
868 0-terminated. It is the responsibility of the caller to make sure
869 that the wchar_t string is 0-terminated in case this is required by
870 the application.
871
872 Returns the number of wchar_t characters copied (excluding a
873 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000874 error. */
875
Martin v. Löwis18e16552006-02-15 17:27:45 +0000876PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000877 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000878 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000879 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000880 );
881
Victor Stinner137c34c2010-09-29 10:25:54 +0000882/* Convert the Unicode object to a wide character string. The output string
883 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200884 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000885
886 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
887 on success. On error, raises a MemoryError and returns NULL; *size is
888 undefined in that case. */
889
890PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000891 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000892 Py_ssize_t *size /* number of characters of the result */
893 );
894
Victor Stinner9f789e72011-10-01 03:57:28 +0200895#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +0200897#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898
Guido van Rossumd8225182000-03-10 22:33:05 +0000899#endif
900
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000901/* --- Unicode ordinals --------------------------------------------------- */
902
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903/* Create a Unicode Object from the given Unicode code point ordinal.
904
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000905 The ordinal must be in range(0x10000) on narrow Python builds
906 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
907 raised in case it is not.
908
909*/
910
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000911PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000913/* --- Free-list management ----------------------------------------------- */
914
915/* Clear the free list used by the Unicode implementation.
916
917 This can be used to release memory used for objects on the free
918 list back to the Python memory allocator.
919
920*/
921
922PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
923
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000925
926 Many of these APIs take two arguments encoding and errors. These
927 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000928 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000929
Georg Brandl952867a2010-06-27 10:17:12 +0000930 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000931
932 Error handling is set by errors which may also be set to NULL
933 meaning to use the default handling defined for the codec. Default
934 error handling for all builtin codecs is "strict" (ValueErrors are
935 raised).
936
937 The codecs all use a similar interface. Only deviations from the
938 generic ones are documented.
939
940*/
941
Fred Drakecb093fe2000-05-09 19:51:53 +0000942/* --- Manage the default encoding ---------------------------------------- */
943
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000944/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000945 Unicode object unicode and the size of the encoded representation
946 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000947
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000948 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000949
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200950 This function caches the UTF-8 encoded string in the unicodeobject
951 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952 when the unicodeobject is deallocated.
953
954 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
955 support the previous internal function with the same behaviour.
956
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000957 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000958 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000959
960 *** If you need to access the Unicode object as UTF-8 bytes string,
961 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000962*/
963
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000964#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000967 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000969#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000970
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000971/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000972 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
975 in the unicodeobject.
976
977 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
978 support the previous internal function with the same behaviour.
979
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000980 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000981 extracted from the returned data.
982
983 *** This API is for interpreter INTERNAL USE ONLY and will likely
984 *** be removed or changed in the future.
985
986 *** If you need to access the Unicode object as UTF-8 bytes string,
987 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000988
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000989*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000990
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000991#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
993#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000994#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000995
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000996/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000997
Mark Hammond91a681d2002-08-12 07:21:58 +0000998PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000999
Guido van Rossumd8225182000-03-10 22:33:05 +00001000/* --- Generic Codecs ----------------------------------------------------- */
1001
1002/* Create a Unicode object by decoding the encoded string s of the
1003 given size. */
1004
Mark Hammond91a681d2002-08-12 07:21:58 +00001005PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001006 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001007 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001008 const char *encoding, /* encoding */
1009 const char *errors /* error handling */
1010 );
1011
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001012/* Decode a Unicode object unicode and return the result as Python
1013 object. */
1014
1015PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 PyObject *unicode, /* Unicode object */
1017 const char *encoding, /* encoding */
1018 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001019 );
1020
1021/* Decode a Unicode object unicode and return the result as Unicode
1022 object. */
1023
1024PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001025 PyObject *unicode, /* Unicode object */
1026 const char *encoding, /* encoding */
1027 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001028 );
1029
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001031 Python string object. */
1032
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001033#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001034PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001035 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001036 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001037 const char *encoding, /* encoding */
1038 const char *errors /* error handling */
1039 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001040#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001041
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001042/* Encodes a Unicode object and returns the result as Python
1043 object. */
1044
1045PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 PyObject *unicode, /* Unicode object */
1047 const char *encoding, /* encoding */
1048 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001049 );
1050
Guido van Rossumd8225182000-03-10 22:33:05 +00001051/* Encodes a Unicode object and returns the result as Python string
1052 object. */
1053
Mark Hammond91a681d2002-08-12 07:21:58 +00001054PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 PyObject *unicode, /* Unicode object */
1056 const char *encoding, /* encoding */
1057 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001058 );
1059
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001060/* Encodes a Unicode object and returns the result as Unicode
1061 object. */
1062
1063PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001064 PyObject *unicode, /* Unicode object */
1065 const char *encoding, /* encoding */
1066 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001067 );
1068
1069/* Build an encoding map. */
1070
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001071PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1072 PyObject* string /* 256 character map */
1073 );
1074
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001075/* --- UTF-7 Codecs ------------------------------------------------------- */
1076
Mark Hammond91a681d2002-08-12 07:21:58 +00001077PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 const char *string, /* UTF-7 encoded string */
1079 Py_ssize_t length, /* size of string */
1080 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081 );
1082
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001083PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001084 const char *string, /* UTF-7 encoded string */
1085 Py_ssize_t length, /* size of string */
1086 const char *errors, /* error handling */
1087 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001088 );
1089
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001090#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001091PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 const Py_UNICODE *data, /* Unicode char buffer */
1093 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1094 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1095 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1096 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001097 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001098PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1099 PyObject *unicode, /* Unicode object */
1100 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1101 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1102 const char *errors /* error handling */
1103 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001104#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001105
Guido van Rossumd8225182000-03-10 22:33:05 +00001106/* --- UTF-8 Codecs ------------------------------------------------------- */
1107
Mark Hammond91a681d2002-08-12 07:21:58 +00001108PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109 const char *string, /* UTF-8 encoded string */
1110 Py_ssize_t length, /* size of string */
1111 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001112 );
1113
Walter Dörwald69652032004-09-07 20:24:22 +00001114PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 const char *string, /* UTF-8 encoded string */
1116 Py_ssize_t length, /* size of string */
1117 const char *errors, /* error handling */
1118 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001119 );
1120
Mark Hammond91a681d2002-08-12 07:21:58 +00001121PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001123 );
1124
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001125#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1127 PyObject *unicode,
1128 const char *errors);
1129
Mark Hammond91a681d2002-08-12 07:21:58 +00001130PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 const Py_UNICODE *data, /* Unicode char buffer */
1132 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1133 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001134 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001135#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001136
Walter Dörwald41980ca2007-08-16 21:55:45 +00001137/* --- UTF-32 Codecs ------------------------------------------------------ */
1138
1139/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1140 the corresponding Unicode object.
1141
1142 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001144
1145 If byteorder is non-NULL, the decoder starts decoding using the
1146 given byte order:
1147
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 *byteorder == -1: little endian
1149 *byteorder == 0: native order
1150 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001151
1152 In native mode, the first four bytes of the stream are checked for a
1153 BOM mark. If found, the BOM mark is analysed, the byte order
1154 adjusted and the BOM skipped. In the other modes, no BOM mark
1155 interpretation is done. After completion, *byteorder is set to the
1156 current byte order at the end of input data.
1157
1158 If byteorder is NULL, the codec starts in native order mode.
1159
1160*/
1161
1162PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001163 const char *string, /* UTF-32 encoded string */
1164 Py_ssize_t length, /* size of string */
1165 const char *errors, /* error handling */
1166 int *byteorder /* pointer to byteorder to use
1167 0=native;-1=LE,1=BE; updated on
1168 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001169 );
1170
1171PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 const char *string, /* UTF-32 encoded string */
1173 Py_ssize_t length, /* size of string */
1174 const char *errors, /* error handling */
1175 int *byteorder, /* pointer to byteorder to use
1176 0=native;-1=LE,1=BE; updated on
1177 exit */
1178 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001179 );
1180
1181/* Returns a Python string using the UTF-32 encoding in native byte
1182 order. The string always starts with a BOM mark. */
1183
1184PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001186 );
1187
1188/* Returns a Python string object holding the UTF-32 encoded value of
1189 the Unicode data.
1190
1191 If byteorder is not 0, output is written according to the following
1192 byte order:
1193
1194 byteorder == -1: little endian
1195 byteorder == 0: native byte order (writes a BOM mark)
1196 byteorder == 1: big endian
1197
1198 If byteorder is 0, the output string will always start with the
1199 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1200 prepended.
1201
1202*/
1203
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001204#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001205PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 const Py_UNICODE *data, /* Unicode char buffer */
1207 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1208 const char *errors, /* error handling */
1209 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001210 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001211PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1212 PyObject *object, /* Unicode object */
1213 const char *errors, /* error handling */
1214 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1215 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001216#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001217
Guido van Rossumd8225182000-03-10 22:33:05 +00001218/* --- UTF-16 Codecs ------------------------------------------------------ */
1219
Guido van Rossum9e896b32000-04-05 20:11:21 +00001220/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001221 the corresponding Unicode object.
1222
1223 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001225
1226 If byteorder is non-NULL, the decoder starts decoding using the
1227 given byte order:
1228
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 *byteorder == -1: little endian
1230 *byteorder == 0: native order
1231 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001232
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001233 In native mode, the first two bytes of the stream are checked for a
1234 BOM mark. If found, the BOM mark is analysed, the byte order
1235 adjusted and the BOM skipped. In the other modes, no BOM mark
1236 interpretation is done. After completion, *byteorder is set to the
1237 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001238
1239 If byteorder is NULL, the codec starts in native order mode.
1240
1241*/
1242
Mark Hammond91a681d2002-08-12 07:21:58 +00001243PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 const char *string, /* UTF-16 encoded string */
1245 Py_ssize_t length, /* size of string */
1246 const char *errors, /* error handling */
1247 int *byteorder /* pointer to byteorder to use
1248 0=native;-1=LE,1=BE; updated on
1249 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001250 );
1251
Walter Dörwald69652032004-09-07 20:24:22 +00001252PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 const char *string, /* UTF-16 encoded string */
1254 Py_ssize_t length, /* size of string */
1255 const char *errors, /* error handling */
1256 int *byteorder, /* pointer to byteorder to use
1257 0=native;-1=LE,1=BE; updated on
1258 exit */
1259 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001260 );
1261
Guido van Rossumd8225182000-03-10 22:33:05 +00001262/* Returns a Python string using the UTF-16 encoding in native byte
1263 order. The string always starts with a BOM mark. */
1264
Mark Hammond91a681d2002-08-12 07:21:58 +00001265PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001267 );
1268
1269/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001270 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001271
1272 If byteorder is not 0, output is written according to the following
1273 byte order:
1274
1275 byteorder == -1: little endian
1276 byteorder == 0: native byte order (writes a BOM mark)
1277 byteorder == 1: big endian
1278
1279 If byteorder is 0, the output string will always start with the
1280 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1281 prepended.
1282
1283 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1284 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001285 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001286
1287*/
1288
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001289#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001290PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 const Py_UNICODE *data, /* Unicode char buffer */
1292 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1293 const char *errors, /* error handling */
1294 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001295 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001296PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1297 PyObject* unicode, /* Unicode object */
1298 const char *errors, /* error handling */
1299 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1300 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001301#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001302
1303/* --- Unicode-Escape Codecs ---------------------------------------------- */
1304
Mark Hammond91a681d2002-08-12 07:21:58 +00001305PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 const char *string, /* Unicode-Escape encoded string */
1307 Py_ssize_t length, /* size of string */
1308 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 );
1310
Mark Hammond91a681d2002-08-12 07:21:58 +00001311PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001313 );
1314
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001315#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001316PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 const Py_UNICODE *data, /* Unicode char buffer */
1318 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001319 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001320#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001321
1322/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1323
Mark Hammond91a681d2002-08-12 07:21:58 +00001324PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 const char *string, /* Raw-Unicode-Escape encoded string */
1326 Py_ssize_t length, /* size of string */
1327 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001328 );
1329
Mark Hammond91a681d2002-08-12 07:21:58 +00001330PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001331 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001332 );
1333
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001334#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001335PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 const Py_UNICODE *data, /* Unicode char buffer */
1337 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001338 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001339#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001340
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001341/* --- Unicode Internal Codec ---------------------------------------------
1342
1343 Only for internal use in _codecsmodule.c */
1344
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001345#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001346PyObject *_PyUnicode_DecodeUnicodeInternal(
1347 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001348 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001349 const char *errors
1350 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001351#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001352
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001353/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001354
1355 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1356
1357*/
1358
Mark Hammond91a681d2002-08-12 07:21:58 +00001359PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 const char *string, /* Latin-1 encoded string */
1361 Py_ssize_t length, /* size of string */
1362 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001363 );
1364
Mark Hammond91a681d2002-08-12 07:21:58 +00001365PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001367 );
1368
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001369#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1371 PyObject* unicode,
1372 const char* errors);
1373
Mark Hammond91a681d2002-08-12 07:21:58 +00001374PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001375 const Py_UNICODE *data, /* Unicode char buffer */
1376 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1377 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001378 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001379#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001380
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001382
1383 Only 7-bit ASCII data is accepted. All other codes generate errors.
1384
1385*/
1386
Mark Hammond91a681d2002-08-12 07:21:58 +00001387PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 const char *string, /* ASCII encoded string */
1389 Py_ssize_t length, /* size of string */
1390 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001391 );
1392
Mark Hammond91a681d2002-08-12 07:21:58 +00001393PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001395 );
1396
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001397#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1399 PyObject* unicode,
1400 const char* errors);
1401
Mark Hammond91a681d2002-08-12 07:21:58 +00001402PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 const Py_UNICODE *data, /* Unicode char buffer */
1404 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1405 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001406 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001407#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001408
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001412
1413 Decoding mappings must map single string characters to single
1414 Unicode characters, integers (which are then interpreted as Unicode
1415 ordinals) or None (meaning "undefined mapping" and causing an
1416 error).
1417
1418 Encoding mappings must map single Unicode characters to single
1419 string characters, integers (which are then interpreted as Latin-1
1420 ordinals) or None (meaning "undefined mapping" and causing an
1421 error).
1422
1423 If a character lookup fails with a LookupError, the character is
1424 copied as-is meaning that its ordinal value will be interpreted as
1425 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1426 to contain those mappings which map characters to different code
1427 points.
1428
1429*/
1430
Mark Hammond91a681d2002-08-12 07:21:58 +00001431PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001432 const char *string, /* Encoded string */
1433 Py_ssize_t length, /* size of string */
1434 PyObject *mapping, /* character mapping
1435 (char ordinal -> unicode ordinal) */
1436 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001437 );
1438
Mark Hammond91a681d2002-08-12 07:21:58 +00001439PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001440 PyObject *unicode, /* Unicode object */
1441 PyObject *mapping /* character mapping
1442 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001443 );
1444
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001445#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001446PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001447 const Py_UNICODE *data, /* Unicode char buffer */
1448 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1449 PyObject *mapping, /* character mapping
1450 (unicode ordinal -> char ordinal) */
1451 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001452 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001453PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1454 PyObject *unicode, /* Unicode object */
1455 PyObject *mapping, /* character mapping
1456 (unicode ordinal -> char ordinal) */
1457 const char *errors /* error handling */
1458 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001459#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001460
1461/* Translate a Py_UNICODE buffer of the given length by applying a
1462 character mapping table to it and return the resulting Unicode
1463 object.
1464
1465 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001466 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001467
1468 Mapping tables may be dictionaries or sequences. Unmapped character
1469 ordinals (ones which cause a LookupError) are left untouched and
1470 are copied as-is.
1471
1472*/
1473
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001474#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001475PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001476 const Py_UNICODE *data, /* Unicode char buffer */
1477 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1478 PyObject *table, /* Translate table */
1479 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001480 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001481#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001482
Victor Stinner99b95382011-07-04 14:23:54 +02001483#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001484
Guido van Rossumefec1152000-03-28 02:01:15 +00001485/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001486
Mark Hammond91a681d2002-08-12 07:21:58 +00001487PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001488 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001489 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001490 const char *errors /* error handling */
1491 );
1492
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001493PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1494 const char *string, /* MBCS encoded string */
1495 Py_ssize_t length, /* size of string */
1496 const char *errors, /* error handling */
1497 Py_ssize_t *consumed /* bytes consumed */
1498 );
1499
Victor Stinner3a50e702011-10-18 21:21:00 +02001500PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1501 int code_page, /* code page number */
1502 const char *string, /* encoded string */
1503 Py_ssize_t length, /* size of string */
1504 const char *errors, /* error handling */
1505 Py_ssize_t *consumed /* bytes consumed */
1506 );
1507
Mark Hammond91a681d2002-08-12 07:21:58 +00001508PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001509 PyObject *unicode /* Unicode object */
1510 );
1511
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001512#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001513PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001514 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001515 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001516 const char *errors /* error handling */
1517 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001518#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001519
Victor Stinner3a50e702011-10-18 21:21:00 +02001520PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1521 int code_page, /* code page number */
1522 PyObject *unicode, /* Unicode object */
1523 const char *errors /* error handling */
1524 );
1525
Victor Stinner99b95382011-07-04 14:23:54 +02001526#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001527
Guido van Rossum9e896b32000-04-05 20:11:21 +00001528/* --- Decimal Encoder ---------------------------------------------------- */
1529
1530/* Takes a Unicode string holding a decimal value and writes it into
1531 an output buffer using standard ASCII digit codes.
1532
1533 The output buffer has to provide at least length+1 bytes of storage
1534 area. The output string is 0-terminated.
1535
1536 The encoder converts whitespace to ' ', decimal characters to their
1537 corresponding ASCII digit and all other Latin-1 characters except
1538 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1539 are treated as errors. This includes embedded NULL bytes.
1540
1541 Error handling is defined by the errors argument:
1542
1543 NULL or "strict": raise a ValueError
1544 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001546 "replace": replaces illegal characters with '?'
1547
1548 Returns 0 on success, -1 on failure.
1549
1550*/
1551
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001552#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001553PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001554 Py_UNICODE *s, /* Unicode buffer */
1555 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1556 char *output, /* Output buffer; must have size >= length+1 */
1557 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001558 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001559#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001560
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001561/* Transforms code points that have decimal digit property to the
1562 corresponding ASCII digit code points.
1563
1564 Returns a new Unicode string on success, NULL on failure.
1565*/
1566
Georg Brandlb5503082010-12-05 11:40:48 +00001567#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001568PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1569 Py_UNICODE *s, /* Unicode buffer */
1570 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1571 );
Georg Brandlb5503082010-12-05 11:40:48 +00001572#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001573
Victor Stinner6f9568b2011-11-17 00:12:44 +01001574/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 as argument instead of a raw buffer and length. This function additionally
1576 transforms spaces to ASCII because this is what the callers in longobject,
1577 floatobject, and complexobject did anyways. */
1578
1579#ifndef Py_LIMITED_API
1580PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1581 PyObject *unicode /* Unicode object */
1582 );
1583#endif
1584
Martin v. Löwis011e8422009-05-05 04:43:17 +00001585/* --- File system encoding ---------------------------------------------- */
1586
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001587/* ParseTuple converter: encode str objects to bytes using
1588 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001589
1590PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1591
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001592/* ParseTuple converter: decode bytes objects to unicode using
1593 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1594
1595PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1596
Victor Stinner77c38622010-05-14 15:58:55 +00001597/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1598 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001599
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001600 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1601 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001602
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001603 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001604*/
1605
1606PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1607 const char *s /* encoded string */
1608 );
1609
Victor Stinner77c38622010-05-14 15:58:55 +00001610/* Decode a string using Py_FileSystemDefaultEncoding
1611 and the "surrogateescape" error handler.
1612
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001613 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1614 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001615*/
1616
Martin v. Löwis011e8422009-05-05 04:43:17 +00001617PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1618 const char *s, /* encoded string */
1619 Py_ssize_t size /* size */
1620 );
1621
Victor Stinnerae6265f2010-05-15 16:27:27 +00001622/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001623 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001624
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001625 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1626 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001627*/
1628
1629PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1630 PyObject *unicode
1631 );
1632
Guido van Rossumd8225182000-03-10 22:33:05 +00001633/* --- Methods & Slots ----------------------------------------------------
1634
1635 These are capable of handling Unicode objects and strings on input
1636 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001637 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001638
1639/* Concat two strings giving a new Unicode string. */
1640
Mark Hammond91a681d2002-08-12 07:21:58 +00001641PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 PyObject *left, /* Left string */
1643 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001644 );
1645
Walter Dörwald1ab83302007-05-18 17:15:44 +00001646/* Concat two strings and put the result in *pleft
1647 (sets *pleft to NULL on error) */
1648
1649PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001650 PyObject **pleft, /* Pointer to left string */
1651 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001652 );
1653
1654/* Concat two strings, put the result in *pleft and drop the right object
1655 (sets *pleft to NULL on error) */
1656
1657PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001658 PyObject **pleft, /* Pointer to left string */
1659 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001660 );
1661
Guido van Rossumd8225182000-03-10 22:33:05 +00001662/* Split a string giving a list of Unicode strings.
1663
1664 If sep is NULL, splitting will be done at all whitespace
1665 substrings. Otherwise, splits occur at the given separator.
1666
1667 At most maxsplit splits will be done. If negative, no limit is set.
1668
1669 Separators are not included in the resulting list.
1670
1671*/
1672
Mark Hammond91a681d2002-08-12 07:21:58 +00001673PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001674 PyObject *s, /* String to split */
1675 PyObject *sep, /* String separator */
1676 Py_ssize_t maxsplit /* Maxsplit count */
1677 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001678
1679/* Ditto, but split at line breaks.
1680
1681 CRLF is considered to be one line break. Line breaks are not
1682 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001683
Mark Hammond91a681d2002-08-12 07:21:58 +00001684PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001685 PyObject *s, /* String to split */
1686 int keepends /* If true, line end markers are included */
1687 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001688
Thomas Wouters477c8d52006-05-27 19:21:47 +00001689/* Partition a string using a given separator. */
1690
1691PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001692 PyObject *s, /* String to partition */
1693 PyObject *sep /* String separator */
1694 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001695
1696/* Partition a string using a given separator, searching from the end of the
1697 string. */
1698
1699PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001700 PyObject *s, /* String to partition */
1701 PyObject *sep /* String separator */
1702 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001703
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001704/* Split a string giving a list of Unicode strings.
1705
1706 If sep is NULL, splitting will be done at all whitespace
1707 substrings. Otherwise, splits occur at the given separator.
1708
1709 At most maxsplit splits will be done. But unlike PyUnicode_Split
1710 PyUnicode_RSplit splits from the end of the string. If negative,
1711 no limit is set.
1712
1713 Separators are not included in the resulting list.
1714
1715*/
1716
1717PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001718 PyObject *s, /* String to split */
1719 PyObject *sep, /* String separator */
1720 Py_ssize_t maxsplit /* Maxsplit count */
1721 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001722
Guido van Rossumd8225182000-03-10 22:33:05 +00001723/* Translate a string by applying a character mapping table to it and
1724 return the resulting Unicode object.
1725
1726 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001727 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001728
1729 Mapping tables may be dictionaries or sequences. Unmapped character
1730 ordinals (ones which cause a LookupError) are left untouched and
1731 are copied as-is.
1732
1733*/
1734
Mark Hammond91a681d2002-08-12 07:21:58 +00001735PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001736 PyObject *str, /* String */
1737 PyObject *table, /* Translate table */
1738 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001739 );
1740
1741/* Join a sequence of strings using the given separator and return
1742 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001743
Mark Hammond91a681d2002-08-12 07:21:58 +00001744PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001745 PyObject *separator, /* Separator string */
1746 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001747 );
1748
1749/* Return 1 if substr matches str[start:end] at the given tail end, 0
1750 otherwise. */
1751
Martin v. Löwis18e16552006-02-15 17:27:45 +00001752PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001753 PyObject *str, /* String */
1754 PyObject *substr, /* Prefix or Suffix string */
1755 Py_ssize_t start, /* Start index */
1756 Py_ssize_t end, /* Stop index */
1757 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001758 );
1759
1760/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001761 given search direction or -1 if not found. -2 is returned in case
1762 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001763
Martin v. Löwis18e16552006-02-15 17:27:45 +00001764PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 PyObject *str, /* String */
1766 PyObject *substr, /* Substring to find */
1767 Py_ssize_t start, /* Start index */
1768 Py_ssize_t end, /* Stop index */
1769 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001770 );
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772/* Like PyUnicode_Find, but search for single character only. */
1773PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1774 PyObject *str,
1775 Py_UCS4 ch,
1776 Py_ssize_t start,
1777 Py_ssize_t end,
1778 int direction
1779 );
1780
Barry Warsaw51ac5802000-03-20 16:36:48 +00001781/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001782
Martin v. Löwis18e16552006-02-15 17:27:45 +00001783PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001784 PyObject *str, /* String */
1785 PyObject *substr, /* Substring to count */
1786 Py_ssize_t start, /* Start index */
1787 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001788 );
1789
Barry Warsaw51ac5802000-03-20 16:36:48 +00001790/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001791 and return the resulting Unicode object. */
1792
Mark Hammond91a681d2002-08-12 07:21:58 +00001793PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001794 PyObject *str, /* String */
1795 PyObject *substr, /* Substring to find */
1796 PyObject *replstr, /* Substring to replace */
1797 Py_ssize_t maxcount /* Max. number of replacements to apply;
1798 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001799 );
1800
1801/* Compare two strings and return -1, 0, 1 for less than, equal,
1802 greater than resp. */
1803
Mark Hammond91a681d2002-08-12 07:21:58 +00001804PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001805 PyObject *left, /* Left string */
1806 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001807 );
1808
Martin v. Löwis5b222132007-06-10 09:51:05 +00001809PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1810 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001811 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001812 );
1813
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001814/* Rich compare two strings and return one of the following:
1815
1816 - NULL in case an exception was raised
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001817 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001818 - Py_NotImplemented in case the type combination is unknown
1819
1820 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1821 case the conversion of the arguments to Unicode fails with a
1822 UnicodeDecodeError.
1823
1824 Possible values for op:
1825
1826 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1827
1828*/
1829
1830PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001831 PyObject *left, /* Left string */
1832 PyObject *right, /* Right string */
1833 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001834 );
1835
Thomas Wouters7e474022000-07-16 12:04:32 +00001836/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001837 the resulting Unicode string. */
1838
Mark Hammond91a681d2002-08-12 07:21:58 +00001839PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001840 PyObject *format, /* Format string */
1841 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001842 );
1843
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001844/* Checks whether element is contained in container and returns 1/0
1845 accordingly.
1846
1847 element has to coerce to a one-element Unicode string. -1 is
1848 returned in case of an error. */
1849
Mark Hammond91a681d2002-08-12 07:21:58 +00001850PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001851 PyObject *container, /* Container string */
1852 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001853 );
1854
Martin v. Löwis47383402007-08-15 07:32:56 +00001855/* Checks whether argument is a valid identifier. */
1856
1857PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1858
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001859#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001860/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001861PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001862 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001863 int striptype,
1864 PyObject *sepobj
1865 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001866#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001867
Eric Smith5807c412008-05-11 21:00:57 +00001868/* Using the current locale, insert the thousands grouping
1869 into the string pointed to by buffer. For the argument descriptions,
1870 see Objects/stringlib/localeutil.h */
1871
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001872#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001873PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1874 Py_ssize_t n_buffer,
1875 Py_UNICODE *digits,
1876 Py_ssize_t n_digits,
1877 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001878#endif
Eric Smith5807c412008-05-11 21:00:57 +00001879
Eric Smitha3b1ac82009-04-03 14:45:06 +00001880/* Using explicit passed-in values, insert the thousands grouping
1881 into the string pointed to by buffer. For the argument descriptions,
1882 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001883#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02001885 PyObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 int kind,
1887 void *buffer,
1888 Py_ssize_t n_buffer,
1889 void *digits,
1890 Py_ssize_t n_digits,
1891 Py_ssize_t min_width,
1892 const char *grouping,
1893 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001894#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001895/* === Characters Type APIs =============================================== */
1896
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001897/* Helper array used by Py_UNICODE_ISSPACE(). */
1898
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001899#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001900PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1901
Guido van Rossumd8225182000-03-10 22:33:05 +00001902/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001903 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001904
1905 These APIs are implemented in Objects/unicodectype.c.
1906
1907*/
1908
Mark Hammond91a681d2002-08-12 07:21:58 +00001909PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001910 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001911 );
1912
Mark Hammond91a681d2002-08-12 07:21:58 +00001913PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001914 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001915 );
1916
Mark Hammond91a681d2002-08-12 07:21:58 +00001917PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001918 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001919 );
1920
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001921PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001922 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001923 );
1924
1925PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001926 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001927 );
1928
Mark Hammond91a681d2002-08-12 07:21:58 +00001929PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001930 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001931 );
1932
Mark Hammond91a681d2002-08-12 07:21:58 +00001933PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001934 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001935 );
1936
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001937PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1938 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001939 );
1940
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001941PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1942 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001943 );
1944
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001945PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1946 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001947 );
1948
Mark Hammond91a681d2002-08-12 07:21:58 +00001949PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001950 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001951 );
1952
Mark Hammond91a681d2002-08-12 07:21:58 +00001953PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001954 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001955 );
1956
Mark Hammond91a681d2002-08-12 07:21:58 +00001957PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001958 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001959 );
1960
Mark Hammond91a681d2002-08-12 07:21:58 +00001961PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001962 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001963 );
1964
Mark Hammond91a681d2002-08-12 07:21:58 +00001965PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001966 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001967 );
1968
Mark Hammond91a681d2002-08-12 07:21:58 +00001969PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001970 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001971 );
1972
Georg Brandl559e5d72008-06-11 18:37:52 +00001973PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001974 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001975 );
1976
Mark Hammond91a681d2002-08-12 07:21:58 +00001977PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001978 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001979 );
1980
Victor Stinneref8d95c2010-08-16 22:03:11 +00001981PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1982 const Py_UNICODE *u
1983 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001984
1985PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001986 Py_UNICODE *s1,
1987 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001988
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001989PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1990 Py_UNICODE *s1, const Py_UNICODE *s2);
1991
Martin v. Löwis5b222132007-06-10 09:51:05 +00001992PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001993 Py_UNICODE *s1,
1994 const Py_UNICODE *s2,
1995 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001996
1997PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001998 const Py_UNICODE *s1,
1999 const Py_UNICODE *s2
2000 );
2001
2002PyAPI_FUNC(int) Py_UNICODE_strncmp(
2003 const Py_UNICODE *s1,
2004 const Py_UNICODE *s2,
2005 size_t n
2006 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002007
2008PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002009 const Py_UNICODE *s,
2010 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002011 );
2012
Victor Stinner331ea922010-08-10 16:37:20 +00002013PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002014 const Py_UNICODE *s,
2015 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002016 );
2017
Victor Stinner71133ff2010-09-01 23:43:53 +00002018/* Create a copy of a unicode string ending with a nul character. Return NULL
2019 and raise a MemoryError exception on memory allocation failure, otherwise
2020 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2021
Victor Stinner46408602010-09-03 16:18:00 +00002022PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002023 PyObject *unicode
2024 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002025#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002026
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002027#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002028PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002029 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002030 int check_content);
2031#endif
2032
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002033/********************* String Literals ****************************************/
2034/* This structure helps managing static strings. The basic usage goes like this:
2035 Instead of doing
2036
2037 r = PyObject_CallMethod(o, "foo", "args", ...);
2038
2039 do
2040
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002041 _Py_IDENTIFIER(foo);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002042 ...
2043 r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
2044
2045 PyId_foo is a static variable, either on block level or file level. On first
2046 usage, the string "foo" is interned, and the structures are linked. On interpreter
2047 shutdown, all strings are released (through _PyUnicode_ClearStaticStrings).
2048
2049 Alternatively, _Py_static_string allows choosing the variable name.
Martin v. Löwisd10759f2011-11-07 13:00:05 +01002050 _PyUnicode_FromId returns a borrowed reference to the interned string.
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002051 _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
2052*/
/* Record for one static identifier string.  Instances are created by
   _Py_static_string / _Py_IDENTIFIER, which initialize the fields to
   { 0, value, 0 }. */
2053typedef struct _Py_Identifier {
2054 struct _Py_Identifier *next; /* link to another _Py_Identifier; starts as 0. Presumably set when the structures are linked on first use -- confirm in the implementation */
2055 const char* string; /* C string value given to _Py_static_string */
2056 PyObject *object; /* starts as 0; presumably the interned Unicode object once it has been created -- confirm in the implementation */
2057} _Py_Identifier;
2058
/* Define a static _Py_Identifier variable named varname whose string field is
   value; the next and object fields start out as 0. */
Martin v. Löwis87da8722011-10-09 11:54:42 +02002059#define _Py_static_string(varname, value) static _Py_Identifier varname = { 0, value, 0 }
/* Define the static identifier PyId_<varname>; its string field is the
   stringized varname. */
Martin v. Löwisbd928fe2011-10-14 10:20:37 +02002060#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002061
2062/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2063PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2064/* Clear all static strings. */
2065PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2066
Guido van Rossumd8225182000-03-10 22:33:05 +00002067#ifdef __cplusplus
2068}
2069#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002070#endif /* !Py_UNICODEOBJECT_H */