blob: bc6ecd4e81e3f2f2bf976c33814ee23e4974d527 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t.

   Both names are hidden when Py_LIMITED_API is defined. */

#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
106#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
108# ifdef _HAVE_BSDI
109# include <time.h>
110# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000111# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000112#endif
113
/* Py_UCS4, Py_UCS2 and Py_UCS1 are fixed-width unsigned typedefs for the
   4-, 2- and 1-byte unicode representations respectively. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200119
Guido van Rossumd8225182000-03-10 22:33:05 +0000120/* --- Internal Unicode Operations ---------------------------------------- */
121
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000122/* Since splitting on whitespace is an important use case, and
123 whitespace in most situations is solely ASCII whitespace, we
124 optimize for the common case by using a quick look-up table
125 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000126
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000127 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000128#ifndef Py_LIMITED_API
/* Whitespace test with an ASCII fast path: values below 128 are looked up
   in the _Py_ascii_whitespace table, all others are passed to
   _PyUnicode_IsWhitespace().  Note that ch is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000131
/* Case and line-break predicates: plain aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, decimal, digit or numeric.  The tests
   short-circuit in that order, so ch may be evaluated up to four times:
   do not pass an expression with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000157
/* Copy `length` Py_UNICODE code units from `source` to `target`.
   Implemented with memcpy(), so the two ranges must not overlap. */
#define Py_UNICODE_COPY(target, source, length) \
    memcpy((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000160
/* Store `value` into the first `length` code units of `target`.
   Each argument is evaluated exactly once, in the order target, value,
   length; `length` is captured before the loop so that an argument with
   side effects is not re-run on every iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000165
/* Macros to work with surrogates.

   The three range tests mask off the low bits and compare once, so `ch`
   is evaluated a single time (the mask ~0x7FF / ~0x3FF sign-extends to the
   width of ch, so values above 0xFFFF and negative values test false). */

/* U+D800..U+DFFF: any surrogate (block of 0x800 code points). */
#define Py_UNICODE_IS_SURROGATE(ch) ((((ch) & ~0x7FF)) == 0xD800)
/* U+D800..U+DBFF: high (leading) surrogate (block of 0x400 code points). */
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((((ch) & ~0x3FF)) == 0xD800)
/* U+DC00..U+DFFF: low (trailing) surrogate (block of 0x400 code points). */
#define Py_UNICODE_IS_LOW_SURROGATE(ch) ((((ch) & ~0x3FF)) == 0xDC00)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300178
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before falling back to
   memcmp() over the whole substring, so most mismatches are rejected
   without the library call.  Both arguments are read through their
   `wstr` / `wstr_length` members and are evaluated several times. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == \
       *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, \
             (substring)->wstr_length*sizeof(Py_UNICODE)))
186
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000187#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000188
Barry Warsaw51ac5802000-03-20 16:36:48 +0000189#ifdef __cplusplus
190extern "C" {
191#endif
192
Guido van Rossumd8225182000-03-10 22:33:05 +0000193/* --- Unicode Type ------------------------------------------------------- */
194
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000195#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200196
197/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
198 structure. state.ascii and state.compact are set, and the data
199 immediately follow the structure. utf8_length and wstr_length can be found
200 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000201typedef struct {
Éric Araujo80a348c2011-10-05 01:11:12 +0200202 /* There are 4 forms of Unicode strings:
Victor Stinner910337b2011-10-03 03:20:16 +0200203
204 - compact ascii:
205
206 * structure = PyASCIIObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100207 * test: PyUnicode_IS_COMPACT_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200208 * kind = PyUnicode_1BYTE_KIND
209 * compact = 1
210 * ascii = 1
211 * ready = 1
Victor Stinner30134f52011-10-04 01:32:45 +0200212 * (length is the length of the utf8 and wstr strings)
213 * (data starts just after the structure)
214 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
Victor Stinner910337b2011-10-03 03:20:16 +0200215
216 - compact:
217
218 * structure = PyCompactUnicodeObject
Victor Stinner80bc72d2011-12-22 03:23:10 +0100219 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
Victor Stinner910337b2011-10-03 03:20:16 +0200220 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
221 PyUnicode_4BYTE_KIND
222 * compact = 1
223 * ready = 1
Victor Stinnera3b334d2011-10-03 13:53:37 +0200224 * ascii = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200225 * utf8 is not shared with data
Victor Stinnera41463c2011-10-04 01:05:08 +0200226 * utf8_length = 0 if utf8 is NULL
227 * wstr is shared with data and wstr_length=length
228 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
Victor Stinnere30c0a12011-11-04 20:54:05 +0100229 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
Victor Stinnera41463c2011-10-04 01:05:08 +0200230 * wstr_length = 0 if wstr is NULL
Victor Stinner30134f52011-10-04 01:32:45 +0200231 * (data starts just after the structure)
Victor Stinner910337b2011-10-03 03:20:16 +0200232
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200233 - legacy string, not ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200234
235 * structure = PyUnicodeObject
Victor Stinner7a9105a2011-12-12 00:13:42 +0100236 * test: kind == PyUnicode_WCHAR_KIND
Victor Stinnere30c0a12011-11-04 20:54:05 +0100237 * length = 0 (use wstr_length)
238 * hash = -1
Victor Stinner910337b2011-10-03 03:20:16 +0200239 * kind = PyUnicode_WCHAR_KIND
240 * compact = 0
Victor Stinner30134f52011-10-04 01:32:45 +0200241 * ascii = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200242 * ready = 0
Victor Stinnere30c0a12011-11-04 20:54:05 +0100243 * interned = SSTATE_NOT_INTERNED
Victor Stinner910337b2011-10-03 03:20:16 +0200244 * wstr is not NULL
245 * data.any is NULL
246 * utf8 is NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200247 * utf8_length = 0
Victor Stinner910337b2011-10-03 03:20:16 +0200248
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200249 - legacy string, ready:
Victor Stinner910337b2011-10-03 03:20:16 +0200250
251 * structure = PyUnicodeObject structure
Victor Stinner7a9105a2011-12-12 00:13:42 +0100252 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
Victor Stinner910337b2011-10-03 03:20:16 +0200253 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
254 PyUnicode_4BYTE_KIND
255 * compact = 0
256 * ready = 1
257 * data.any is not NULL
Victor Stinnera41463c2011-10-04 01:05:08 +0200258 * utf8 is shared and utf8_length = length with data.any if ascii = 1
259 * utf8_length = 0 if utf8 is NULL
Victor Stinnere30c0a12011-11-04 20:54:05 +0100260 * wstr is shared with data.any and wstr_length = length
Victor Stinnera41463c2011-10-04 01:05:08 +0200261 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
262 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
263 * wstr_length = 0 if wstr is NULL
Victor Stinner910337b2011-10-03 03:20:16 +0200264
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200265 Compact strings use only one memory block (structure + characters),
266 whereas legacy strings use one block for the structure and one block
267 for characters.
Victor Stinner910337b2011-10-03 03:20:16 +0200268
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200269 Legacy strings are created by PyUnicode_FromUnicode() and
270 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
271 when PyUnicode_READY() is called.
272
273 See also _PyUnicode_CheckConsistency().
274 */
Guido van Rossumd8225182000-03-10 22:33:05 +0000275 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200276 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000277 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200278 struct {
279 /*
280 SSTATE_NOT_INTERNED (0)
281 SSTATE_INTERNED_MORTAL (1)
282 SSTATE_INTERNED_IMMORTAL (2)
283
284 If interned != SSTATE_NOT_INTERNED, the two references from the
285 dictionary to this object are *not* counted in ob_refcnt.
286 */
287 unsigned int interned:2;
288 /* Character size:
289
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200290 - PyUnicode_WCHAR_KIND (0):
291
292 * character type = wchar_t (16 or 32 bits, depending on the
293 platform)
294
295 - PyUnicode_1BYTE_KIND (1):
296
297 * character type = Py_UCS1 (8 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100298 * all characters are in the range U+0000-U+00FF (latin1)
299 * if ascii is set, all characters are in the range U+0000-U+007F
300 (ASCII), otherwise at least one character is in the range
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200301 U+0080-U+00FF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200302
303 - PyUnicode_2BYTE_KIND (2):
304
305 * character type = Py_UCS2 (16 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100306 * all characters are in the range U+0000-U+FFFF (BMP)
307 * at least one character is in the range U+0100-U+FFFF
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200308
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200309 - PyUnicode_4BYTE_KIND (4):
Victor Stinner4d0d54b2011-10-05 01:31:05 +0200310
311 * character type = Py_UCS4 (32 bits, unsigned)
Victor Stinner77faf692011-11-20 18:56:05 +0100312 * all characters are in the range U+0000-U+10FFFF
313 * at least one character is in the range U+10000-U+10FFFF
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 */
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200315 unsigned int kind:3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Compact is with respect to the allocation scheme. Compact unicode
317 objects only require one memory block while non-compact objects use
318 one block for the PyUnicodeObject struct and another for its data
319 buffer. */
320 unsigned int compact:1;
Victor Stinner77faf692011-11-20 18:56:05 +0100321 /* The string only contains characters in the range U+0000-U+007F (ASCII)
Victor Stinner1d4b35f2011-10-06 01:51:19 +0200322 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
323 set, use the PyASCIIObject structure. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 unsigned int ascii:1;
325 /* The ready flag indicates whether the object layout is initialized
326 completely. This means that this is either a compact object, or
327 the data pointer is filled out. The bit is redundant, and helps
328 to minimize the test in PyUnicode_IS_READY(). */
329 unsigned int ready:1;
Antoine Pitrou8c6f8dc2014-03-23 22:55:03 +0100330 /* Padding to ensure that PyUnicode_DATA() is always aligned to
331 4 bytes (see issue #19537 on m68k). */
332 unsigned int :24;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 } state;
334 wchar_t *wstr; /* wchar_t representation (null-terminated) */
335} PyASCIIObject;
336
337/* Non-ASCII strings allocated through PyUnicode_New use the
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200338 PyCompactUnicodeObject structure. state.compact is set, and the data
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200339 immediately follow the structure. */
340typedef struct {
341 PyASCIIObject _base;
342 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
343 * terminating \0. */
344 char *utf8; /* UTF-8 representation (null-terminated) */
345 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
346 * surrogates count as two code points. */
347} PyCompactUnicodeObject;
348
349/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
350 PyUnicodeObject structure. The actual string data is initially in the wstr
Victor Stinnera3b334d2011-10-03 13:53:37 +0200351 block, and copied into the data block using _PyUnicode_Ready. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200352typedef struct {
353 PyCompactUnicodeObject _base;
354 union {
355 void *any;
356 Py_UCS1 *latin1;
357 Py_UCS2 *ucs2;
358 Py_UCS4 *ucs4;
359 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000360} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000361#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000362
Mark Hammond91a681d2002-08-12 07:21:58 +0000363PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000364PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000365
Thomas Wouters27d517b2007-02-25 20:39:11 +0000366#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000367 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
368#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000369
370/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000371#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372
/* Length of the wstr representation.  Compact ASCII objects have no
   wstr_length member (they are PyASCIIObject), so their `length` field is
   used instead; everything else is read as a PyCompactUnicodeObject.
   `op` is parenthesized inside the casts so that an expression argument
   (e.g. a conditional or pointer arithmetic) is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
377
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points.

   The result of PyUnicode_AsUnicode() is discarded and only checked by an
   assert on wstr.
   NOTE(review): with assertions compiled out, a failed conversion would
   not be detected here -- confirm callers tolerate that. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation in bytes: the code-unit count
   from PyUnicode_GET_SIZE() times Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
393
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   When wstr is already set it is returned directly and
   PyUnicode_AsUnicode() is not called. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* The same buffer as PyUnicode_AS_UNICODE(), viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
406
407
Georg Brandlc6bc4c62011-10-05 16:23:09 +0200408/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200409
/* Values for PyASCIIObject.state: */

/* Interning state (stored in the 2-bit state.interned field). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
416
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  Both preconditions (unicode object, ready) are checked by assert
   only.  `op` is parenthesized inside the cast so that expression
   arguments are cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
Victor Stinnera3b334d2011-10-03 13:53:37 +0200424
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed: the macro simply reads
   the state.compact bit through a PyASCIIObject cast. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)
429
/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is parenthesized inside the cast so that expression arguments are
   cast as a whole; it is evaluated twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200434
/* Storage kinds recorded in PyASCIIObject.state.kind.  The three "BYTE"
   kinds have the same numeric value as their character size in bytes. */
enum PyUnicode_Kind {
/* String contains only wstr (wchar_t) characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
454
/* Return one of the PyUnicode_*_KIND values defined above.  The object
   must be a ready unicode object; both conditions are checked by assert
   only. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
460
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the characters start directly after the header, which
   is a PyASCIIObject for ASCII strings and a PyCompactUnicodeObject
   otherwise (hence the "+ 1" on the appropriately typed pointer). */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the characters live in the separately referenced
   data.any buffer, which is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on state.compact to one of the two accessors above. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
475
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   value is narrowed by a cast to the element type selected by kind.  Any
   kind other than 1BYTE/2BYTE is treated as 4BYTE; that assumption is
   checked by assert only. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
        } \
    } while (0)
502
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.

   kind selects the element type used to index data: Py_UCS1 for 1BYTE,
   Py_UCS2 for 2BYTE and Py_UCS4 for any other value.  The result is always
   widened to Py_UCS4.  kind may be evaluated twice; data and index are
   evaluated once. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
514
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.

   The object must be a ready unicode object (checked by assert only).
   `unicode` is expanded several times; `index` is evaluated once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530
/* Returns the length of the unicode string, i.e. the `length` field of the
   PyASCIIObject header.  The caller has to make sure that the string has
   its canonical representation set before calling this macro.  Call
   PyUnicode_READY() to ensure that; readiness is checked here by assert
   only. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
538
539
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any), but
   implemented as a single read of the state.ready bit.  `op` is
   parenthesized inside the cast so that expression arguments are cast as
   a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
544
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: an object whose ready bit is already set yields 0 without a
   function call.  If the canonical representation is not yet set, it will
   still call _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string: the result is derived only from the
   ascii flag and the kind (0x7f, 0xff, 0xffff or 0x10ffff).
   All four constants are unsigned so every branch has the same type. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7fU) :                                                         \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000567#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000568
569/* --- Constants ---------------------------------------------------------- */
570
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000577
578/* === Public API ========================================================= */
579
580/* --- Plain Py_UNICODE --------------------------------------------------- */
581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200582/* With PEP 393, this is the recommended way to allocate a new unicode object.
583 This function will allocate the object and its buffer in a single memory
584 block. Objects created using this function are not resizable. */
585#ifndef Py_LIMITED_API
586PyAPI_FUNC(PyObject*) PyUnicode_New(
587 Py_ssize_t size, /* Number of code points in the new string */
588 Py_UCS4 maxchar /* maximum code point value in the string */
589 );
590#endif
591
Benjamin Peterson82f34ad2015-01-13 09:17:24 -0500592/* Initializes the canonical string representation from the deprecated
Victor Stinnerd8f65102011-09-29 19:43:17 +0200593 wstr/Py_UNICODE representation. This function is used to convert Unicode
594 objects which were created using the old API to the new flexible format
595 introduced with PEP 393.
596
597 Don't call this function directly, use the public PyUnicode_READY() macro
598 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200599#ifndef Py_LIMITED_API
600PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200601 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602 );
603#endif
604
Victor Stinner034f6cf2011-09-30 02:26:44 +0200605/* Get a copy of a Unicode string. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100606#ifndef Py_LIMITED_API
607PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
Victor Stinner034f6cf2011-09-30 02:26:44 +0200608 PyObject *unicode
609 );
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100610#endif
Victor Stinner034f6cf2011-09-30 02:26:44 +0200611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612/* Copy character from one unicode object into another, this function performs
Victor Stinner3fe55312012-01-04 00:33:50 +0100613 character conversion when necessary and falls back to memcpy() if possible.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614
Victor Stinner3fe55312012-01-04 00:33:50 +0100615 Fail if to is too small (smaller than *how_many* or smaller than
Victor Stinnera0702ab2011-09-29 14:14:38 +0200616 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
Victor Stinner3fe55312012-01-04 00:33:50 +0100617 kind(to), or if *to* has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618
619 Return the number of written character, or return -1 and raise an exception
620 on error.
621
622 Pseudo-code:
623
624 how_many = min(how_many, len(from) - from_start)
625 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
626 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200627
628 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200629 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200631PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200632 PyObject *to,
633 Py_ssize_t to_start,
634 PyObject *from,
635 Py_ssize_t from_start,
636 Py_ssize_t how_many
637 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200638
639/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
640 may crash if parameters are invalid (e.g. if the output string
641 is too short). */
642PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
643 PyObject *to,
644 Py_ssize_t to_start,
645 PyObject *from,
646 Py_ssize_t from_start,
647 Py_ssize_t how_many
648 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649#endif
650
Victor Stinnerd3f08822012-05-29 12:57:52 +0200651#ifndef Py_LIMITED_API
Victor Stinner3fe55312012-01-04 00:33:50 +0100652/* Fill a string with a character: write fill_char into
653 unicode[start:start+length].
654
655 Fail if fill_char is bigger than the string maximum character, or if the
656 string has more than 1 reference.
657
658 Return the number of written character, or return -1 and raise an exception
659 on error. */
Victor Stinner3fe55312012-01-04 00:33:50 +0100660PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
661 PyObject *unicode,
662 Py_ssize_t start,
663 Py_ssize_t length,
664 Py_UCS4 fill_char
665 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200666
667/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
668 if parameters are invalid (e.g. if length is longer than the string). */
669PyAPI_FUNC(void) _PyUnicode_FastFill(
670 PyObject *unicode,
671 Py_ssize_t start,
672 Py_ssize_t length,
673 Py_UCS4 fill_char
674 );
Victor Stinner3fe55312012-01-04 00:33:50 +0100675#endif
676
Guido van Rossumd8225182000-03-10 22:33:05 +0000677/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000679
680 u may be NULL which causes the contents to be undefined. It is the
681 user's responsibility to fill in the needed data afterwards. Note
682 that modifying the Unicode object contents after construction is
683 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000684
685 The buffer is copied into the new object. */
686
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000687#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000688PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000689 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000690 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000691 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000692#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000693
Georg Brandl952867a2010-06-27 10:17:12 +0000694/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000695PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000696 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000697 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000698 );
699
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000700/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000702PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000703 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000704 );
705
Victor Stinnerd3f08822012-05-29 12:57:52 +0200706#ifndef Py_LIMITED_API
Victor Stinnerb9275c12011-10-05 14:01:42 +0200707/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
708 Scan the string to find the maximum character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
710 int kind,
711 const void *buffer,
712 Py_ssize_t size);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200713
714/* Create a new string from a buffer of ASCII characters.
715 WARNING: Don't check if the string contains any non-ASCII character. */
716PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
717 const char *buffer,
718 Py_ssize_t size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719#endif
720
721PyAPI_FUNC(PyObject*) PyUnicode_Substring(
722 PyObject *str,
723 Py_ssize_t start,
724 Py_ssize_t end);
725
Victor Stinnerece58de2012-04-23 23:36:38 +0200726#ifndef Py_LIMITED_API
727/* Compute the maximum character of the substring unicode[start:end].
728 Return 127 for an empty string. */
729PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
730 PyObject *unicode,
731 Py_ssize_t start,
732 Py_ssize_t end);
733#endif
734
Georg Brandldb6c7f52011-10-07 11:19:11 +0200735/* Copy the string into a UCS4 buffer including the null character if copy_null
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 is set. Return NULL and raise an exception on error. Raise a ValueError if
737 the buffer is smaller than the string. Return buffer on success.
738
739 buflen is the length of the buffer in (Py_UCS4) characters. */
740PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
741 PyObject *unicode,
742 Py_UCS4* buffer,
743 Py_ssize_t buflen,
744 int copy_null);
745
746/* Copy the string into a UCS4 buffer. A new buffer is allocated using
747 * PyMem_Malloc; if this fails, NULL is returned with a memory error
748 exception set. */
749PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
750
Guido van Rossumd8225182000-03-10 22:33:05 +0000751/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200752 Py_UNICODE buffer.
753 If the wchar_t/Py_UNICODE representation is not yet available, this
754 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000755
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000756#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000757PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000759 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000760#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200762/* Return a read-only pointer to the Unicode object's internal
763 Py_UNICODE buffer and save the length at size.
764 If the wchar_t/Py_UNICODE representation is not yet available, this
765 function will calculate it. */
766
767#ifndef Py_LIMITED_API
768PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
769 PyObject *unicode, /* Unicode object */
770 Py_ssize_t *size /* location where to save the length */
771 );
772#endif
773
Guido van Rossumd8225182000-03-10 22:33:05 +0000774/* Get the length of the Unicode object. */
775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200776PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
777 PyObject *unicode
778);
779
Victor Stinner157f83f2011-09-28 21:41:31 +0200780/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200781 string representation. */
782
Martin v. Löwis18e16552006-02-15 17:27:45 +0000783PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000785 );
786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787/* Read a character from the string. */
788
789PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
790 PyObject *unicode,
791 Py_ssize_t index
792 );
793
794/* Write a character to the string. The string must have been created through
Victor Stinnercd9950f2011-10-02 00:34:53 +0200795 PyUnicode_New, must not be shared, and must not have been hashed yet.
796
797 Return 0 on success, -1 on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798
799PyAPI_FUNC(int) PyUnicode_WriteChar(
800 PyObject *unicode,
801 Py_ssize_t index,
802 Py_UCS4 character
803 );
804
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000805#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000806/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000807PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000808#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000809
Martin Panter6245cb32016-04-15 02:14:19 +0000810/* Resize a Unicode object. The length is the number of characters, except
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100811 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
812 is the number of Py_UNICODE characters.
Guido van Rossum52c23592000-04-10 13:41:41 +0000813
814 *unicode is modified to point to the new (resized) object and 0
815 returned on success.
816
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100817 Try to resize the string in place (which is usually faster than allocating
818 a new string and copy characters), or create a new string.
Guido van Rossum52c23592000-04-10 13:41:41 +0000819
820 Error handling is implemented as follows: an exception is set, -1
Victor Stinner16e6a802011-12-12 13:24:15 +0100821 is returned and *unicode left untouched.
822
823 WARNING: The function doesn't check string content, the result may not be a
824 string in canonical representation. */
Guido van Rossum52c23592000-04-10 13:41:41 +0000825
Mark Hammond91a681d2002-08-12 07:21:58 +0000826PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 PyObject **unicode, /* Pointer to the Unicode object */
828 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000829 );
830
Serhiy Storchaka6a7b3a72016-04-17 08:32:47 +0300831/* Decode obj to a Unicode object.
Guido van Rossumd8225182000-03-10 22:33:05 +0000832
Martin Panter20d32552016-04-15 00:56:21 +0000833 bytes, bytearray and other bytes-like objects are decoded according to the
834 given encoding and error handler. The encoding and error handler can be
835 NULL to have the interface use UTF-8 and "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000836
Martin Panter20d32552016-04-15 00:56:21 +0000837 All other objects (including Unicode objects) raise an exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000838
839 The API returns NULL in case of an error. The caller is responsible
840 for decref'ing the returned objects.
841
842*/
843
Mark Hammond91a681d2002-08-12 07:21:58 +0000844PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200845 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000846 const char *encoding, /* encoding */
847 const char *errors /* error handling */
848 );
849
Martin Panter20d32552016-04-15 00:56:21 +0000850/* Copy an instance of a Unicode subtype to a new true Unicode object if
851 necessary. If obj is already a true Unicode object (not a subtype), return
852 the reference with *incremented* refcount.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000853
854 The API returns NULL in case of an error. The caller is responsible
855 for decref'ing the returned objects.
856
857*/
858
Mark Hammond91a681d2002-08-12 07:21:58 +0000859PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200860 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000861 );
862
Victor Stinner1205f272010-09-11 00:54:47 +0000863PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
864 const char *format, /* ASCII-encoded string */
865 va_list vargs
866 );
867PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
868 const char *format, /* ASCII-encoded string */
869 ...
870 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000872#ifndef Py_LIMITED_API
Victor Stinnerd3f08822012-05-29 12:57:52 +0200873typedef struct {
874 PyObject *buffer;
875 void *data;
876 enum PyUnicode_Kind kind;
877 Py_UCS4 maxchar;
878 Py_ssize_t size;
879 Py_ssize_t pos;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200880
881 /* minimum number of allocated characters (default: 0) */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200882 Py_ssize_t min_length;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200883
884 /* minimum character (default: 127, ASCII) */
885 Py_UCS4 min_char;
886
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200887 /* If non-zero, overallocate the buffer (default: 0). */
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200888 unsigned char overallocate;
Victor Stinner8f674cc2013-04-17 23:02:17 +0200889
Victor Stinnerd7b7c742012-06-04 22:52:12 +0200890 /* If readonly is 1, buffer is a shared string (cannot be modified)
891 and size is set to 0. */
892 unsigned char readonly;
Victor Stinnerd3f08822012-05-29 12:57:52 +0200893} _PyUnicodeWriter ;
894
895/* Initialize a Unicode writer.
Victor Stinner8f674cc2013-04-17 23:02:17 +0200896 *
897 * By default, the minimum buffer size is 0 character and overallocation is
898 * disabled. Set min_length, min_char and overallocate attributes to control
899 * the allocation of the buffer. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200900PyAPI_FUNC(void)
Victor Stinner8f674cc2013-04-17 23:02:17 +0200901_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +0200902
903/* Prepare the buffer to write 'length' characters
904 with the specified maximum character.
905
906 Return 0 on success, raise an exception and return -1 on error. */
907#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
908 (((MAXCHAR) <= (WRITER)->maxchar \
909 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
910 ? 0 \
911 : (((LENGTH) == 0) \
912 ? 0 \
913 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
914
915/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
916 instead. */
917PyAPI_FUNC(int)
918_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
919 Py_ssize_t length, Py_UCS4 maxchar);
920
Victor Stinnerca9381e2015-09-22 00:58:32 +0200921/* Prepare the buffer to have at least the kind KIND.
922 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
923 support characters in range U+000-U+FFFF.
924
925 Return 0 on success, raise an exception and return -1 on error. */
926#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \
927 (assert((KIND) != PyUnicode_WCHAR_KIND), \
928 (KIND) <= (WRITER)->kind \
929 ? 0 \
930 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
931
932/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
933 macro instead. */
934PyAPI_FUNC(int)
935_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
936 enum PyUnicode_Kind kind);
937
Victor Stinnera0dd0212013-04-11 22:09:04 +0200938/* Append a Unicode character.
939 Return 0 on success, raise an exception and return -1 on error. */
940PyAPI_FUNC(int)
941_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
942 Py_UCS4 ch
943 );
944
Victor Stinnere215d962012-10-06 23:03:36 +0200945/* Append a Unicode string.
946 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200947PyAPI_FUNC(int)
Victor Stinnere215d962012-10-06 23:03:36 +0200948_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
949 PyObject *str /* Unicode string */
950 );
Victor Stinnerd3f08822012-05-29 12:57:52 +0200951
Victor Stinnercfc4c132013-04-03 01:48:39 +0200952/* Append a substring of a Unicode string.
953 Return 0 on success, raise an exception and return -1 on error. */
954PyAPI_FUNC(int)
955_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
956 PyObject *str, /* Unicode string */
957 Py_ssize_t start,
958 Py_ssize_t end
959 );
960
Serhiy Storchakad65c9492015-11-02 14:10:23 +0200961/* Append an ASCII-encoded byte string.
Victor Stinner4a587072013-11-19 12:54:53 +0100962 Return 0 on success, raise an exception and return -1 on error. */
963PyAPI_FUNC(int)
964_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
965 const char *str, /* ASCII-encoded byte string */
966 Py_ssize_t len /* number of bytes, or -1 if unknown */
967 );
968
Victor Stinnere215d962012-10-06 23:03:36 +0200969/* Append a latin1-encoded byte string.
970 Return 0 on success, raise an exception and return -1 on error. */
971PyAPI_FUNC(int)
Victor Stinner4a587072013-11-19 12:54:53 +0100972_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
973 const char *str, /* latin1-encoded byte string */
974 Py_ssize_t len /* length in bytes */
Victor Stinnere215d962012-10-06 23:03:36 +0200975 );
976
Martin Panter6245cb32016-04-15 02:14:19 +0000977/* Get the value of the writer as a Unicode string. Clear the
Victor Stinnere215d962012-10-06 23:03:36 +0200978 buffer of the writer. Raise an exception and return NULL
979 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200980PyAPI_FUNC(PyObject *)
981_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
982
Victor Stinnere215d962012-10-06 23:03:36 +0200983/* Deallocate memory of a writer (clear its internal buffer). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200984PyAPI_FUNC(void)
985_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
986#endif
987
988#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000989/* Format the object based on the format_spec, as defined in PEP 3101
990 (Advanced String Formatting). */
Victor Stinnerd3f08822012-05-29 12:57:52 +0200991PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
992 _PyUnicodeWriter *writer,
993 PyObject *obj,
994 PyObject *format_spec,
995 Py_ssize_t start,
996 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000997#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000998
Walter Dörwald16807132007-05-25 13:52:07 +0000999PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
1000PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001001PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1002 const char *u /* UTF-8 encoded string */
1003 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001004#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +00001005PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001006#endif
Walter Dörwald16807132007-05-25 13:52:07 +00001007
/* Evaluates to the `state.interned` field of op.
   Use only if you know it's a string: op is cast to PyASCIIObject
   without any type check. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +00001011
Guido van Rossumd8225182000-03-10 22:33:05 +00001012/* --- wchar_t support for platforms which support it --------------------- */
1013
1014#ifdef HAVE_WCHAR_H
1015
Georg Brandl952867a2010-06-27 10:17:12 +00001016/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +00001017 size.
1018
1019 The buffer is copied into the new object. */
1020
Mark Hammond91a681d2002-08-12 07:21:58 +00001021PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001022 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001023 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 );
1025
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001026/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +00001027 most size wchar_t characters are copied.
1028
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001029 Note that the resulting wchar_t string may or may not be
1030 0-terminated. It is the responsibility of the caller to make sure
1031 that the wchar_t string is 0-terminated in case this is required by
1032 the application.
1033
1034 Returns the number of wchar_t characters copied (excluding a
1035 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +00001036 error. */
1037
Martin v. Löwis18e16552006-02-15 17:27:45 +00001038PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001039 PyObject *unicode, /* Unicode object */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001040 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001042 );
1043
Victor Stinner137c34c2010-09-29 10:25:54 +00001044/* Convert the Unicode object to a wide character string. The output string
1045 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +02001046 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +00001047
Victor Stinner22fabe22015-02-11 18:17:56 +01001048 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
Victor Stinner137c34c2010-09-29 10:25:54 +00001049 on success. On error, returns NULL, *size is undefined and raises a
1050 MemoryError. */
1051
1052PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001053 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +00001054 Py_ssize_t *size /* number of characters of the result */
1055 );
1056
Victor Stinner9f789e72011-10-01 03:57:28 +02001057#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
Victor Stinner9f789e72011-10-01 03:57:28 +02001059#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060
Guido van Rossumd8225182000-03-10 22:33:05 +00001061#endif
1062
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001063/* --- Unicode ordinals --------------------------------------------------- */
1064
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065/* Create a Unicode Object from the given Unicode code point ordinal.
1066
Ezio Melottie7f90372012-10-05 03:33:31 +03001067 The ordinal must be in range(0x110000). A ValueError is
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 raised in case it is not.
1069
1070*/
1071
Marc-André Lemburg9c329de2002-08-12 08:19:10 +00001072PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001073
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001074/* --- Free-list management ----------------------------------------------- */
1075
1076/* Clear the free list used by the Unicode implementation.
1077
1078 This can be used to release memory used for objects on the free
1079 list back to the Python memory allocator.
1080
1081*/
1082
1083PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1084
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments, encoding and errors. These
   parameters have the same semantics as the ones of the builtin str() API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors, which may also be set to NULL,
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/
1102
Fred Drakecb093fe2000-05-09 19:51:53 +00001103/* --- Manage the default encoding ---------------------------------------- */
1104
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001105/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001106 Unicode object unicode and the size of the encoded representation
1107 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +00001108
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001109 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001110
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001111 This function caches the UTF-8 encoded string in the unicodeobject
1112 and subsequent calls will return the same string. The memory is released
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 when the unicodeobject is deallocated.
1114
1115 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1116 support the previous internal function with the same behaviour.
1117
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001118 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001119 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001120
1121 *** If you need to access the Unicode object as UTF-8 bytes string,
1122 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +00001123*/
1124
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001125#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001127 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001128 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001130#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001131
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001132/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001133 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1136 in the unicodeobject.
1137
1138 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1139 support the previous internal function with the same behaviour.
1140
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001141 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001142 extracted from the returned data.
1143
1144 *** This API is for interpreter INTERNAL USE ONLY and will likely
1145 *** be removed or changed for Python 3.1.
1146
1147 *** If you need to access the Unicode object as UTF-8 bytes string,
1148 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001149
Marc-André Lemburg9155aa72008-04-29 11:14:08 +00001150*/
Martin v. Löwis5b222132007-06-10 09:51:05 +00001151
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001152#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1154#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001155#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +00001156
Alexander Belopolsky83283c22010-11-16 14:29:01 +00001157/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +00001158
Mark Hammond91a681d2002-08-12 07:21:58 +00001159PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +00001160
Guido van Rossumd8225182000-03-10 22:33:05 +00001161/* --- Generic Codecs ----------------------------------------------------- */
1162
1163/* Create a Unicode object by decoding the encoded string s of the
1164 given size. */
1165
Mark Hammond91a681d2002-08-12 07:21:58 +00001166PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001167 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001168 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +00001169 const char *encoding, /* encoding */
1170 const char *errors /* error handling */
1171 );
1172
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001173/* Decode a Unicode object unicode and return the result as Python
1174 object. */
1175
1176PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001177 PyObject *unicode, /* Unicode object */
1178 const char *encoding, /* encoding */
1179 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001180 );
1181
1182/* Decode a Unicode object unicode and return the result as Unicode
1183 object. */
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 PyObject *unicode, /* Unicode object */
1187 const char *encoding, /* encoding */
1188 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 );
1190
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001191/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +00001192 Python string object. */
1193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001194#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001195PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +00001196 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001197 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001198 const char *encoding, /* encoding */
1199 const char *errors /* error handling */
1200 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001201#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001202
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001203/* Encodes a Unicode object and returns the result as Python
1204 object. */
1205
1206PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 PyObject *unicode, /* Unicode object */
1208 const char *encoding, /* encoding */
1209 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001210 );
1211
Guido van Rossumd8225182000-03-10 22:33:05 +00001212/* Encodes a Unicode object and returns the result as Python string
1213 object. */
1214
Mark Hammond91a681d2002-08-12 07:21:58 +00001215PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001216 PyObject *unicode, /* Unicode object */
1217 const char *encoding, /* encoding */
1218 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001219 );
1220
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001221/* Encodes a Unicode object and returns the result as Unicode
1222 object. */
1223
1224PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225 PyObject *unicode, /* Unicode object */
1226 const char *encoding, /* encoding */
1227 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001228 );
1229
1230/* Build an encoding map. */
1231
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00001232PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1233 PyObject* string /* 256 character map */
1234 );
1235
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001236/* --- UTF-7 Codecs ------------------------------------------------------- */
1237
Mark Hammond91a681d2002-08-12 07:21:58 +00001238PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 const char *string, /* UTF-7 encoded string */
1240 Py_ssize_t length, /* size of string */
1241 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001242 );
1243
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001244PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 const char *string, /* UTF-7 encoded string */
1246 Py_ssize_t length, /* size of string */
1247 const char *errors, /* error handling */
1248 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001249 );
1250
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001251#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001252PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 const Py_UNICODE *data, /* Unicode char buffer */
1254 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1255 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1256 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1257 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001258 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001259PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1260 PyObject *unicode, /* Unicode object */
1261 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1262 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1263 const char *errors /* error handling */
1264 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001265#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001266
Guido van Rossumd8225182000-03-10 22:33:05 +00001267/* --- UTF-8 Codecs ------------------------------------------------------- */
1268
Mark Hammond91a681d2002-08-12 07:21:58 +00001269PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 const char *string, /* UTF-8 encoded string */
1271 Py_ssize_t length, /* size of string */
1272 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001273 );
1274
Walter Dörwald69652032004-09-07 20:24:22 +00001275PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 const char *string, /* UTF-8 encoded string */
1277 Py_ssize_t length, /* size of string */
1278 const char *errors, /* error handling */
1279 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001280 );
1281
Mark Hammond91a681d2002-08-12 07:21:58 +00001282PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001284 );
1285
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001286#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1288 PyObject *unicode,
1289 const char *errors);
1290
Mark Hammond91a681d2002-08-12 07:21:58 +00001291PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 const Py_UNICODE *data, /* Unicode char buffer */
1293 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1294 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001295 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001296#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001297
Walter Dörwald41980ca2007-08-16 21:55:45 +00001298/* --- UTF-32 Codecs ------------------------------------------------------ */
1299
1300/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1301 the corresponding Unicode object.
1302
1303 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001305
1306 If byteorder is non-NULL, the decoder starts decoding using the
1307 given byte order:
1308
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 *byteorder == -1: little endian
1310 *byteorder == 0: native order
1311 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001312
1313 In native mode, the first four bytes of the stream are checked for a
1314 BOM mark. If found, the BOM mark is analysed, the byte order
1315 adjusted and the BOM skipped. In the other modes, no BOM mark
1316 interpretation is done. After completion, *byteorder is set to the
1317 current byte order at the end of input data.
1318
1319 If byteorder is NULL, the codec starts in native order mode.
1320
1321*/
1322
1323PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001324 const char *string, /* UTF-32 encoded string */
1325 Py_ssize_t length, /* size of string */
1326 const char *errors, /* error handling */
1327 int *byteorder /* pointer to byteorder to use
1328 0=native;-1=LE,1=BE; updated on
1329 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001330 );
1331
1332PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 const char *string, /* UTF-32 encoded string */
1334 Py_ssize_t length, /* size of string */
1335 const char *errors, /* error handling */
1336 int *byteorder, /* pointer to byteorder to use
1337 0=native;-1=LE,1=BE; updated on
1338 exit */
1339 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001340 );
1341
1342/* Returns a Python string using the UTF-32 encoding in native byte
1343 order. The string always starts with a BOM mark. */
1344
1345PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001347 );
1348
1349/* Returns a Python string object holding the UTF-32 encoded value of
1350 the Unicode data.
1351
1352 If byteorder is not 0, output is written according to the following
1353 byte order:
1354
1355 byteorder == -1: little endian
1356 byteorder == 0: native byte order (writes a BOM mark)
1357 byteorder == 1: big endian
1358
1359 If byteorder is 0, the output string will always start with the
1360 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1361 prepended.
1362
1363*/
1364
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001365#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001366PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 const Py_UNICODE *data, /* Unicode char buffer */
1368 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1369 const char *errors, /* error handling */
1370 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001371 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001372PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1373 PyObject *object, /* Unicode object */
1374 const char *errors, /* error handling */
1375 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1376 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001377#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001378
Guido van Rossumd8225182000-03-10 22:33:05 +00001379/* --- UTF-16 Codecs ------------------------------------------------------ */
1380
Guido van Rossum9e896b32000-04-05 20:11:21 +00001381/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001382 the corresponding Unicode object.
1383
1384 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001386
1387 If byteorder is non-NULL, the decoder starts decoding using the
1388 given byte order:
1389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001390 *byteorder == -1: little endian
1391 *byteorder == 0: native order
1392 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001393
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001394 In native mode, the first two bytes of the stream are checked for a
1395 BOM mark. If found, the BOM mark is analysed, the byte order
1396 adjusted and the BOM skipped. In the other modes, no BOM mark
1397 interpretation is done. After completion, *byteorder is set to the
1398 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001399
1400 If byteorder is NULL, the codec starts in native order mode.
1401
1402*/
1403
Mark Hammond91a681d2002-08-12 07:21:58 +00001404PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001405 const char *string, /* UTF-16 encoded string */
1406 Py_ssize_t length, /* size of string */
1407 const char *errors, /* error handling */
1408 int *byteorder /* pointer to byteorder to use
1409 0=native;-1=LE,1=BE; updated on
1410 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001411 );
1412
Walter Dörwald69652032004-09-07 20:24:22 +00001413PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 const char *string, /* UTF-16 encoded string */
1415 Py_ssize_t length, /* size of string */
1416 const char *errors, /* error handling */
1417 int *byteorder, /* pointer to byteorder to use
1418 0=native;-1=LE,1=BE; updated on
1419 exit */
1420 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001421 );
1422
Guido van Rossumd8225182000-03-10 22:33:05 +00001423/* Returns a Python string using the UTF-16 encoding in native byte
1424 order. The string always starts with a BOM mark. */
1425
Mark Hammond91a681d2002-08-12 07:21:58 +00001426PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001427 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001428 );
1429
1430/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001431 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001432
1433 If byteorder is not 0, output is written according to the following
1434 byte order:
1435
1436 byteorder == -1: little endian
1437 byteorder == 0: native byte order (writes a BOM mark)
1438 byteorder == 1: big endian
1439
1440 If byteorder is 0, the output string will always start with the
1441 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1442 prepended.
1443
1444 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1445 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001446 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001447
1448*/
1449
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001450#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001451PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 const Py_UNICODE *data, /* Unicode char buffer */
1453 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1454 const char *errors, /* error handling */
1455 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001456 );
Martin v. Löwis1db7c132011-11-10 18:24:32 +01001457PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1458 PyObject* unicode, /* Unicode object */
1459 const char *errors, /* error handling */
1460 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1461 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001462#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001463
1464/* --- Unicode-Escape Codecs ---------------------------------------------- */
1465
Mark Hammond91a681d2002-08-12 07:21:58 +00001466PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001467 const char *string, /* Unicode-Escape encoded string */
1468 Py_ssize_t length, /* size of string */
1469 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001470 );
1471
Mark Hammond91a681d2002-08-12 07:21:58 +00001472PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001473 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001474 );
1475
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001476#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001477PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001478 const Py_UNICODE *data, /* Unicode char buffer */
1479 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001480 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001481#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001482
1483/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1484
Mark Hammond91a681d2002-08-12 07:21:58 +00001485PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001486 const char *string, /* Raw-Unicode-Escape encoded string */
1487 Py_ssize_t length, /* size of string */
1488 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001489 );
1490
Mark Hammond91a681d2002-08-12 07:21:58 +00001491PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001492 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001493 );
1494
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001495#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001496PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 const Py_UNICODE *data, /* Unicode char buffer */
1498 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001499 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001500#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001501
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001502/* --- Unicode Internal Codec ---------------------------------------------
1503
1504 Only for internal use in _codecsmodule.c */
1505
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001506#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001507PyObject *_PyUnicode_DecodeUnicodeInternal(
1508 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001509 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001510 const char *errors
1511 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001512#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001513
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001514/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001515
1516 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1517
1518*/
1519
Mark Hammond91a681d2002-08-12 07:21:58 +00001520PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001521 const char *string, /* Latin-1 encoded string */
1522 Py_ssize_t length, /* size of string */
1523 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001524 );
1525
Mark Hammond91a681d2002-08-12 07:21:58 +00001526PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001528 );
1529
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001530#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1532 PyObject* unicode,
1533 const char* errors);
1534
Mark Hammond91a681d2002-08-12 07:21:58 +00001535PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001536 const Py_UNICODE *data, /* Unicode char buffer */
1537 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1538 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001539 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001540#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001541
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001543
1544 Only 7-bit ASCII data is accepted. All other codes generate errors.
1545
1546*/
1547
Mark Hammond91a681d2002-08-12 07:21:58 +00001548PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001549 const char *string, /* ASCII encoded string */
1550 Py_ssize_t length, /* size of string */
1551 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001552 );
1553
Mark Hammond91a681d2002-08-12 07:21:58 +00001554PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001556 );
1557
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001558#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1560 PyObject* unicode,
1561 const char* errors);
1562
Mark Hammond91a681d2002-08-12 07:21:58 +00001563PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001564 const Py_UNICODE *data, /* Unicode char buffer */
1565 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1566 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001567 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001568#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001569
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001570/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001571
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001572 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001573
1574 Decoding mappings must map single string characters to single
1575 Unicode characters, integers (which are then interpreted as Unicode
1576 ordinals) or None (meaning "undefined mapping" and causing an
1577 error).
1578
1579 Encoding mappings must map single Unicode characters to single
1580 string characters, integers (which are then interpreted as Latin-1
1581 ordinals) or None (meaning "undefined mapping" and causing an
1582 error).
1583
1584 If a character lookup fails with a LookupError, the character is
1585 copied as-is meaning that its ordinal value will be interpreted as
1586 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1587 to contain those mappings which map characters to different code
1588 points.
1589
1590*/
1591
Mark Hammond91a681d2002-08-12 07:21:58 +00001592PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 const char *string, /* Encoded string */
1594 Py_ssize_t length, /* size of string */
1595 PyObject *mapping, /* character mapping
1596 (char ordinal -> unicode ordinal) */
1597 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001598 );
1599
Mark Hammond91a681d2002-08-12 07:21:58 +00001600PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001601 PyObject *unicode, /* Unicode object */
1602 PyObject *mapping /* character mapping
1603 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001604 );
1605
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001606#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001607PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001608 const Py_UNICODE *data, /* Unicode char buffer */
1609 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1610 PyObject *mapping, /* character mapping
1611 (unicode ordinal -> char ordinal) */
1612 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001613 );
Martin v. Löwis23e275b2011-11-02 18:02:51 +01001614PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1615 PyObject *unicode, /* Unicode object */
1616 PyObject *mapping, /* character mapping
1617 (unicode ordinal -> char ordinal) */
1618 const char *errors /* error handling */
1619 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001620#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001621
1622/* Translate a Py_UNICODE buffer of the given length by applying a
1623 character mapping table to it and return the resulting Unicode
1624 object.
1625
1626 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001627 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001628
1629 Mapping tables may be dictionaries or sequences. Unmapped character
1630 ordinals (ones which cause a LookupError) are left untouched and
1631 are copied as-is.
1632
1633*/
1634
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001635#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001636PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001637 const Py_UNICODE *data, /* Unicode char buffer */
1638 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1639 PyObject *table, /* Translate table */
1640 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001641 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001642#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001643
Steve Dowercc16be82016-09-08 10:35:16 -07001644#ifdef MS_WINDOWS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001645
Guido van Rossumefec1152000-03-28 02:01:15 +00001646/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001647
Mark Hammond91a681d2002-08-12 07:21:58 +00001648PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001649 const char *string, /* MBCS encoded string */
Steve Dowerf5aba582016-09-06 19:42:27 -07001650 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001651 const char *errors /* error handling */
1652 );
1653
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001654PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1655 const char *string, /* MBCS encoded string */
1656 Py_ssize_t length, /* size of string */
1657 const char *errors, /* error handling */
1658 Py_ssize_t *consumed /* bytes consumed */
1659 );
1660
Victor Stinner3a50e702011-10-18 21:21:00 +02001661PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1662 int code_page, /* code page number */
1663 const char *string, /* encoded string */
1664 Py_ssize_t length, /* size of string */
1665 const char *errors, /* error handling */
1666 Py_ssize_t *consumed /* bytes consumed */
1667 );
1668
Mark Hammond91a681d2002-08-12 07:21:58 +00001669PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001670 PyObject *unicode /* Unicode object */
1671 );
1672
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001673#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001674PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001675 const Py_UNICODE *data, /* Unicode char buffer */
Victor Stinner3a50e702011-10-18 21:21:00 +02001676 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001677 const char *errors /* error handling */
1678 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001679#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001680
Victor Stinner3a50e702011-10-18 21:21:00 +02001681PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1682 int code_page, /* code page number */
1683 PyObject *unicode, /* Unicode object */
1684 const char *errors /* error handling */
1685 );
1686
Steve Dowercc16be82016-09-08 10:35:16 -07001687#endif /* MS_WINDOWS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001688
Guido van Rossum9e896b32000-04-05 20:11:21 +00001689/* --- Decimal Encoder ---------------------------------------------------- */
1690
1691/* Takes a Unicode string holding a decimal value and writes it into
1692 an output buffer using standard ASCII digit codes.
1693
1694 The output buffer has to provide at least length+1 bytes of storage
1695 area. The output string is 0-terminated.
1696
1697 The encoder converts whitespace to ' ', decimal characters to their
1698 corresponding ASCII digit and all other Latin-1 characters except
1699 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1700 are treated as errors. This includes embedded NUL characters.
1701
1702 Error handling is defined by the errors argument:
1703
1704 NULL or "strict": raise a ValueError
1705 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001706 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001707 "replace": replaces illegal characters with '?'
1708
1709 Returns 0 on success, -1 on failure.
1710
1711*/
1712
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001713#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001714PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001715 Py_UNICODE *s, /* Unicode buffer */
1716 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1717 char *output, /* Output buffer; must have size >= length+1 (0-terminated) */
1718 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001719 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001720#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001721
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001722/* Transforms code points that have decimal digit property to the
1723 corresponding ASCII digit code points.
1724
1725 Returns a new Unicode string on success, NULL on failure.
1726*/
1727
Georg Brandlb5503082010-12-05 11:40:48 +00001728#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001729PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1730 Py_UNICODE *s, /* Unicode buffer */
1731 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1732 );
Georg Brandlb5503082010-12-05 11:40:48 +00001733#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001734
Victor Stinner6f9568b2011-11-17 00:12:44 +01001735/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 as argument instead of a raw buffer and length. This function additionally
1737 transforms spaces to ASCII because this is what the callers in longobject,
1738 floatobject, and complexobject did anyways. */
1739
1740#ifndef Py_LIMITED_API
1741PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1742 PyObject *unicode /* Unicode object */
1743 );
1744#endif
1745
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001746/* --- Locale encoding --------------------------------------------------- */
1747
1748/* Decode a string from the current locale encoding. The decoder is strict if
1749 *errors* is NULL or "strict"; if *errors* is "surrogateescape" it uses the
1750 'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
1751 byte sequence can be decoded as a surrogate character and *errors* is
1752 "surrogateescape", the byte sequence is escaped using that error handler
1753 instead of being decoded. *str* must end with a null character but cannot
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001754 contain embedded null characters. */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001755
1756PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1757 const char *str,
1758 Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01001759 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001760
1761/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1762 length using strlen(). */
1763
1764PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1765 const char *str,
Victor Stinner1b579672011-12-17 05:47:23 +01001766 const char *errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01001767
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001768/* Encode a Unicode object to the current locale encoding. The encoder is
1769 strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape",
1770 the "surrogateescape" error handler is used. Return a bytes object. The string
Victor Stinnerd45c7f82012-12-04 01:34:47 +01001771 cannot contain embedded null characters. */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001772
1773PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1774 PyObject *unicode,
Victor Stinner1b579672011-12-17 05:47:23 +01001775 const char *errors
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01001776 );
1777
Martin v. Löwis011e8422009-05-05 04:43:17 +00001778/* --- File system encoding ---------------------------------------------- */
1779
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001780/* ParseTuple converter: encode str objects to bytes using
1781 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001782
1783PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1784
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001785/* ParseTuple converter: decode bytes objects to unicode using
1786 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1787
1788PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1789
Victor Stinner77c38622010-05-14 15:58:55 +00001790/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1791 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001792
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001793 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1794 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001795
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001796 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001797*/
1798
1799PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1800 const char *s /* encoded string */
1801 );
1802
Victor Stinner77c38622010-05-14 15:58:55 +00001803/* Decode a string using Py_FileSystemDefaultEncoding
1804 and the "surrogateescape" error handler.
1805
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001806 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1807 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001808*/
1809
Martin v. Löwis011e8422009-05-05 04:43:17 +00001810PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1811 const char *s, /* encoded string */
1812 Py_ssize_t size /* size */
1813 );
1814
Victor Stinnerae6265f2010-05-15 16:27:27 +00001815/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001816 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001817
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001818 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1819 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001820*/
1821
1822PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1823 PyObject *unicode
1824 );
1825
Guido van Rossumd8225182000-03-10 22:33:05 +00001826/* --- Methods & Slots ----------------------------------------------------
1827
1828 These are capable of handling Unicode objects and strings on input
1829 (we refer to them as strings in the descriptions) and return
Georg Brandlc6bc4c62011-10-05 16:23:09 +02001830 Unicode objects or integers as appropriate. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001831
1832/* Concat two strings giving a new Unicode string. */
1833
Mark Hammond91a681d2002-08-12 07:21:58 +00001834PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001835 PyObject *left, /* Left string */
1836 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001837 );
1838
Walter Dörwald1ab83302007-05-18 17:15:44 +00001839/* Concat two strings and put the result in *pleft
1840 (sets *pleft to NULL on error) */
1841
1842PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001843 PyObject **pleft, /* Pointer to left string */
1844 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001845 );
1846
1847/* Concat two strings, put the result in *pleft and drop the right object
1848 (sets *pleft to NULL on error) */
1849
1850PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001851 PyObject **pleft, /* Pointer to left string */
1852 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001853 );
1854
Guido van Rossumd8225182000-03-10 22:33:05 +00001855/* Split a string giving a list of Unicode strings.
1856
1857 If sep is NULL, splitting will be done at all whitespace
1858 substrings. Otherwise, splits occur at the given separator.
1859
1860 At most maxsplit splits will be done. If negative, no limit is set.
1861
1862 Separators are not included in the resulting list.
1863
1864*/
1865
Mark Hammond91a681d2002-08-12 07:21:58 +00001866PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001867 PyObject *s, /* String to split */
1868 PyObject *sep, /* String separator */
1869 Py_ssize_t maxsplit /* Maxsplit count */
1870 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001871
1872/* Ditto, but split at line breaks.
1873
1874 CRLF is considered to be one line break. Line breaks are not
1875 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001876
Mark Hammond91a681d2002-08-12 07:21:58 +00001877PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001878 PyObject *s, /* String to split */
1879 int keepends /* If true, line end markers are included */
1880 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001881
Thomas Wouters477c8d52006-05-27 19:21:47 +00001882/* Partition a string using a given separator. */
1883
1884PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001885 PyObject *s, /* String to partition */
1886 PyObject *sep /* String separator */
1887 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001888
1889/* Partition a string using a given separator, searching from the end of the
1890 string. */
1891
1892PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001893 PyObject *s, /* String to partition */
1894 PyObject *sep /* String separator */
1895 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001896
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001897/* Split a string giving a list of Unicode strings.
1898
1899 If sep is NULL, splitting will be done at all whitespace
1900 substrings. Otherwise, splits occur at the given separator.
1901
1902 At most maxsplit splits will be done; if maxsplit is negative, no
1903 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1904 the end of the string.
1905
1906 Separators are not included in the resulting list.
1907
1908*/
1909
1910PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001911 PyObject *s, /* String to split */
1912 PyObject *sep, /* String separator */
1913 Py_ssize_t maxsplit /* Maxsplit count */
1914 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001915
Guido van Rossumd8225182000-03-10 22:33:05 +00001916/* Translate a string by applying a character mapping table to it and
1917 return the resulting Unicode object.
1918
1919 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001920 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001921
1922 Mapping tables may be dictionaries or sequences. Unmapped character
1923 ordinals (ones which cause a LookupError) are left untouched and
1924 are copied as-is.
1925
1926*/
1927
Mark Hammond91a681d2002-08-12 07:21:58 +00001928PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001929 PyObject *str, /* String */
1930 PyObject *table, /* Translate table */
1931 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001932 );
1933
1934/* Join a sequence of strings using the given separator and return
1935 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001936
Mark Hammond91a681d2002-08-12 07:21:58 +00001937PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001938 PyObject *separator, /* Separator string */
1939 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001940 );
1941
Serhiy Storchakaea525a22016-09-06 22:07:53 +03001942#ifndef Py_LIMITED_API
/* Private helper, not declared when Py_LIMITED_API is defined.
   NOTE(review): presumably the C-array counterpart of PyUnicode_Join():
   joins the seqlen objects stored in items using separator and returns the
   resulting Unicode string -- confirm against the implementation. */
1943PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1944 PyObject *separator, /* Separator string (cf. PyUnicode_Join) */
1945 PyObject **items, /* C array of objects to join */
1946 Py_ssize_t seqlen /* presumably the number of entries in items */
1947 );
1948#endif /* Py_LIMITED_API */
1949
Guido van Rossumd8225182000-03-10 22:33:05 +00001950/* Return 1 if substr matches str[start:end] at the given tail end, 0
1951 otherwise. */
1952
Martin v. Löwis18e16552006-02-15 17:27:45 +00001953PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001954 PyObject *str, /* String */
1955 PyObject *substr, /* Prefix or Suffix string */
1956 Py_ssize_t start, /* Start index */
1957 Py_ssize_t end, /* Stop index */
1958 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001959 );
1960
1961/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001962 given search direction or -1 if not found. -2 is returned in case
1963 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001964
Martin v. Löwis18e16552006-02-15 17:27:45 +00001965PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001966 PyObject *str, /* String */
1967 PyObject *substr, /* Substring to find */
1968 Py_ssize_t start, /* Start index */
1969 Py_ssize_t end, /* Stop index */
1970 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001971 );
1972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973/* Like PyUnicode_Find, but searches for a single character only. */
1974PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1975 PyObject *str, /* String */
1976 Py_UCS4 ch, /* Character to find */
1977 Py_ssize_t start, /* Start index */
1978 Py_ssize_t end, /* Stop index */
1979 int direction /* Find direction, as for PyUnicode_Find */
1980 );
1981
Barry Warsaw51ac5802000-03-20 16:36:48 +00001982/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001983
Martin v. Löwis18e16552006-02-15 17:27:45 +00001984PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001985 PyObject *str, /* String */
1986 PyObject *substr, /* Substring to count */
1987 Py_ssize_t start, /* Start index */
1988 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001989 );
1990
Barry Warsaw51ac5802000-03-20 16:36:48 +00001991/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001992 and return the resulting Unicode object. */
1993
Mark Hammond91a681d2002-08-12 07:21:58 +00001994PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001995 PyObject *str, /* String */
1996 PyObject *substr, /* Substring to find */
1997 PyObject *replstr, /* Substring to replace */
1998 Py_ssize_t maxcount /* Max. number of replacements to apply;
1999 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00002000 );
2001
2002/* Compare two strings and return -1, 0, 1 for less than, equal,
Victor Stinner90db9c42012-10-04 21:53:50 +02002003 greater than resp.
2004 Raise an exception and return -1 on error. */
Guido van Rossumd8225182000-03-10 22:33:05 +00002005
Mark Hammond91a681d2002-08-12 07:21:58 +00002006PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002007 PyObject *left, /* Left string */
2008 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00002009 );
2010
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002011#ifndef Py_LIMITED_API
/* Compare a string with a _Py_Identifier. Private; not declared when
   Py_LIMITED_API is defined.
   NOTE(review): presumably returns -1, 0, 1 like PyUnicode_Compare();
   error behaviour is not visible from this header -- confirm against the
   implementation. */
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002012PyAPI_FUNC(int) _PyUnicode_CompareWithId(
2013 PyObject *left, /* Left string */
2014 _Py_Identifier *right /* Right identifier */
2015 );
Martin v. Löwis1c0689c2014-01-03 21:36:49 +01002016#endif
Victor Stinnerad14ccd2013-11-07 00:46:04 +01002017
/* Compare a Unicode object with a C string.
   NOTE(review): presumably returns -1, 0, 1 for less than, equal, greater
   than, like PyUnicode_Compare(); whether an exception can be set is not
   visible from this header -- confirm against the C API documentation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00002018PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2019 PyObject *left, /* Left string (Unicode object) */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00002020 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00002021 );
2022
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002023/* Rich compare two strings and return one of the following:
2024
2025 - NULL in case an exception was raised
Martin Panter69332c12016-08-04 13:07:31 +00002026 - Py_True or Py_False for successful comparisons
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002027 - Py_NotImplemented in case the type combination is unknown
2028
2029 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
2030 case the conversion of the arguments to Unicode fails with a
2031 UnicodeDecodeError.
2032
2033 Possible values for op:
2034
2035 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2036
2037*/
2038
2039PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002040 PyObject *left, /* Left string */
2041 PyObject *right, /* Right string */
2042 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00002043 );
2044
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002045/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00002046 the resulting Unicode string. */
2047
Mark Hammond91a681d2002-08-12 07:21:58 +00002048PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002049 PyObject *format, /* Format string */
2050 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00002051 );
2052
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002053/* Checks whether element is contained in container and returns 1/0
2054 accordingly.
2055
Martin Pantercc71a792016-04-05 06:19:42 +00002056 element has to coerce to a one element Unicode string. -1 is
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002057 returned in case of an error. */
2058
Mark Hammond91a681d2002-08-12 07:21:58 +00002059PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002060 PyObject *container, /* Container string */
2061 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00002062 );
2063
Martin v. Löwis47383402007-08-15 07:32:56 +00002064/* Checks whether argument is a valid identifier. */
2065
2066PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2067
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002068#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002069/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00002070PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002071 PyObject *self,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002072 int striptype,
2073 PyObject *sepobj
2074 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002075#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00002076
Eric Smitha3b1ac82009-04-03 14:45:06 +00002077/* Using explicit passed-in values, insert the thousands grouping
2078 into the string pointed to by buffer. For the argument descriptions,
2079 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002080#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02002082 PyObject *unicode,
Victor Stinner41a863c2012-02-24 00:37:51 +01002083 Py_ssize_t index,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 Py_ssize_t n_buffer,
2085 void *digits,
2086 Py_ssize_t n_digits,
2087 Py_ssize_t min_width,
2088 const char *grouping,
Victor Stinner41a863c2012-02-24 00:37:51 +01002089 PyObject *thousands_sep,
2090 Py_UCS4 *maxchar);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002091#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002092/* === Characters Type APIs =============================================== */
2093
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002094/* Helper array used by Py_UNICODE_ISSPACE(). */
2095
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002096#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00002097PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2098
Guido van Rossumd8225182000-03-10 22:33:05 +00002099/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00002100 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00002101
2102 These APIs are implemented in Objects/unicodectype.c.
2103
2104*/
2105
Mark Hammond91a681d2002-08-12 07:21:58 +00002106PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002107 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002108 );
2109
Mark Hammond91a681d2002-08-12 07:21:58 +00002110PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002111 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002112 );
2113
Mark Hammond91a681d2002-08-12 07:21:58 +00002114PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002115 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002116 );
2117
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002118PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002119 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002120 );
2121
2122PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002123 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00002124 );
2125
Mark Hammond91a681d2002-08-12 07:21:58 +00002126PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002127 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002128 );
2129
Mark Hammond91a681d2002-08-12 07:21:58 +00002130PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002131 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002132 );
2133
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002134PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2135 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002136 );
2137
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002138PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2139 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002140 );
2141
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002142PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2143 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002144 );
2145
/* "Full" case-conversion helpers: unlike _PyUnicode_ToLowercase() and
   friends, which return a single Py_UCS4, these write their result through
   the res pointer and return an int.
   NOTE(review): presumably res receives the mapped code point(s) and the
   return value is how many were written; the required capacity of res is
   not stated in this header -- confirm against Objects/unicodectype.c
   before calling. */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002146PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2147 Py_UCS4 ch, /* Unicode character */
2148 Py_UCS4 *res /* Output buffer (see note above) */
2149 );
2150
2151PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2152 Py_UCS4 ch, /* Unicode character */
2153 Py_UCS4 *res /* Output buffer (see note above) */
2154 );
2155
2156PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2157 Py_UCS4 ch, /* Unicode character */
2158 Py_UCS4 *res /* Output buffer (see note above) */
2159 );
2160
Benjamin Petersond5890c82012-01-14 13:23:30 -05002161PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2162 Py_UCS4 ch, /* Unicode character */
2163 Py_UCS4 *res /* Output buffer (see note above) */
2164 );
2165
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002166PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002167 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002168 );
2169
2170PyAPI_FUNC(int) _PyUnicode_IsCased(
Amaury Forgeot d'Arc77b1ecf2012-01-13 22:12:37 +01002171 Py_UCS4 ch /* Unicode character */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05002172 );
2173
Mark Hammond91a681d2002-08-12 07:21:58 +00002174PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002175 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002176 );
2177
Mark Hammond91a681d2002-08-12 07:21:58 +00002178PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002179 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002180 );
2181
Mark Hammond91a681d2002-08-12 07:21:58 +00002182PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002183 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002184 );
2185
Mark Hammond91a681d2002-08-12 07:21:58 +00002186PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002187 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002188 );
2189
Mark Hammond91a681d2002-08-12 07:21:58 +00002190PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002191 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002192 );
2193
Mark Hammond91a681d2002-08-12 07:21:58 +00002194PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002195 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00002196 );
2197
Georg Brandl559e5d72008-06-11 18:37:52 +00002198PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002199 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00002200 );
2201
Mark Hammond91a681d2002-08-12 07:21:58 +00002202PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00002203 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00002204 );
2205
/* Py_UNICODE string helpers.
   NOTE(review): names and signatures mirror the C library's strlen, strcpy,
   strcat, strncpy, strcmp, strncmp, strchr and strrchr, operating on
   Py_UNICODE buffers instead of char buffers; presumably the semantics
   (including the caller's responsibility for buffer sizes and
   termination) match their char counterparts -- confirm against the
   implementation. */
Victor Stinneref8d95c2010-08-16 22:03:11 +00002206PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2207 const Py_UNICODE *u
2208 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002209
2210PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002211 Py_UNICODE *s1,
2212 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002213
Victor Stinnerc4eb7652010-09-01 23:43:50 +00002214PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2215 Py_UNICODE *s1, const Py_UNICODE *s2);
2216
Martin v. Löwis5b222132007-06-10 09:51:05 +00002217PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002218 Py_UNICODE *s1,
2219 const Py_UNICODE *s2,
2220 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002221
2222PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002223 const Py_UNICODE *s1,
2224 const Py_UNICODE *s2
2225 );
2226
2227PyAPI_FUNC(int) Py_UNICODE_strncmp(
2228 const Py_UNICODE *s1,
2229 const Py_UNICODE *s2,
2230 size_t n
2231 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00002232
2233PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002234 const Py_UNICODE *s,
2235 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00002236 );
2237
Victor Stinner331ea922010-08-10 16:37:20 +00002238PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00002239 const Py_UNICODE *s,
2240 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00002241 );
2242
/* Private formatting helper returning a PyObject*.
   NOTE(review): the object argument and the three int arguments are unnamed
   in this prototype, so their meaning and order cannot be read from the
   header; consult the definition before calling. */
Ethan Furmanb95b5612015-01-23 20:05:18 -08002243PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2244
Victor Stinner71133ff2010-09-01 23:43:53 +00002245/* Create a copy of a unicode string ending with a nul character. Return NULL
2246 and raise a MemoryError exception on memory allocation failure, otherwise
2247 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2248
Victor Stinner46408602010-09-03 16:18:00 +00002249PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00002250 PyObject *unicode
2251 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002252#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00002253
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002254#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
/* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not.
   NOTE(review): presumably verifies the internal invariants of the Unicode
   object op, with a non-zero check_content enabling additional checks of
   the character data -- confirm against the implementation. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002255PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
Victor Stinner7931d9a2011-11-04 00:22:48 +01002256 PyObject *op,
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002257 int check_content);
2258#endif
2259
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03002260#ifndef Py_LIMITED_API
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002261/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2262PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2263/* Clear all static strings. */
2264PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2265
Raymond Hettingerac2ef652015-07-04 16:04:44 -07002266/* Fast equality check when the inputs are known to be exact unicode types
2267 and where the hash values are equal (i.e. a very probable match) */
2268PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
Serhiy Storchaka9fab79b2016-09-11 11:03:14 +03002269#endif /* !Py_LIMITED_API */
Raymond Hettingerac2ef652015-07-04 16:04:44 -07002270
Guido van Rossumd8225182000-03-10 22:33:05 +00002271#ifdef __cplusplus
2272}
2273#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00002274#endif /* !Py_UNICODEOBJECT_H */