blob: 99f54c3a42e75a2e52d0e1c85248c4373caa99a8 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
   unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
/* Whitespace test.  Code points below 128 are answered from the
   _Py_ascii_whitespace look-up table, everything else is handed to
   _PyUnicode_IsWhitespace().  ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])

/* Case and line-break predicates */
#define Py_UNICODE_ISLOWER(ch)     _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)     _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)     _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates */
#define Py_UNICODE_ISDECIMAL(ch)   _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)     _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)   _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric conversions */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)   _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.  The tests are
   tried in that order and stop at the first hit, so ch may be evaluated
   up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
/* Copy `length` Py_UNICODE code units from `source` to `target` via
   Py_MEMCPY.  The byte count is (length)*sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` Py_UNICODE code units at `target`.
   Each argument is evaluated exactly once (target, then value, then
   length), so arguments with side effects are safe.  A length of zero or
   less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t i_, n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.  The range checks evaluate ch twice, so
   do not pass an expression with side effects.  ch is parenthesized so
   that operator expressions (e.g. `c & 0xFFFF`) are tested as a whole. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before falling back to
   memcmp() over the whole substring.

   Both objects must already have their wstr (Py_UNICODE) representation;
   this macro does not create it.  wstr is a member of PyASCIIObject and
   the wstr length is obtained through PyUnicode_WSTR_LENGTH(), so the
   macro accepts a pointer to any of the unicode object structs.
   All arguments are evaluated more than once. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
209 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200210 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000211 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212 struct {
213 /*
214 SSTATE_NOT_INTERNED (0)
215 SSTATE_INTERNED_MORTAL (1)
216 SSTATE_INTERNED_IMMORTAL (2)
217
218 If interned != SSTATE_NOT_INTERNED, the two references from the
219 dictionary to this object are *not* counted in ob_refcnt.
220 */
221 unsigned int interned:2;
222 /* Character size:
223
224 PyUnicode_WCHAR_KIND (0): wchar_t*
225 PyUnicode_1BYTE_KIND (1): Py_UCS1*
226 PyUnicode_2BYTE_KIND (2): Py_UCS2*
227 PyUnicode_4BYTE_KIND (3): Py_UCS4*
228 */
229 unsigned int kind:2;
230 /* Compact is with respect to the allocation scheme. Compact unicode
231 objects only require one memory block while non-compact objects use
232 one block for the PyUnicodeObject struct and another for its data
233 buffer. */
234 unsigned int compact:1;
235 /* Compact objects which are ASCII-only also have the state.compact
236 flag set, and use the PyASCIIObject struct. */
237 unsigned int ascii:1;
238 /* The ready flag indicates whether the object layout is initialized
239 completely. This means that this is either a compact object, or
240 the data pointer is filled out. The bit is redundant, and helps
241 to minimize the test in PyUnicode_IS_READY(). */
242 unsigned int ready:1;
243 } state;
244 wchar_t *wstr; /* wchar_t representation (null-terminated) */
245} PyASCIIObject;
246
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure. */
250typedef struct {
251 PyASCIIObject _base;
252 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
253 * terminating \0. */
254 char *utf8; /* UTF-8 representation (null-terminated) */
255 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
256 * surrogates count as two code points. */
257} PyCompactUnicodeObject;
258
259/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
260 PyUnicodeObject structure. The actual string data is initially in the wstr
261 block, and copied into the data block using PyUnicode_Ready. */
262typedef struct {
263 PyCompactUnicodeObject _base;
264 union {
265 void *any;
266 Py_UCS1 *latin1;
267 Py_UCS2 *ucs2;
268 Py_UCS4 *ucs4;
269 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000270} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000271#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000272
Mark Hammond91a681d2002-08-12 07:21:58 +0000273PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000274PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000275
Thomas Wouters27d517b2007-02-25 20:39:11 +0000276#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000277 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
278#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000279
280/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000281#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200282
/* Length of the wstr representation.  When state.ascii is set it is taken
   from the length field of PyASCIIObject, otherwise from the wstr_length
   field of PyCompactUnicodeObject.  No type check is performed; op is
   evaluated twice. */
#define PyUnicode_WSTR_LENGTH(op) \
    (((PyASCIIObject*)(op))->state.ascii ?       \
     ((PyASCIIObject*)(op))->length :            \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
287
288/* Returns the deprecated Py_UNICODE representation's size in code units
289 (this includes surrogate pairs as 2 units).
290 If the Py_UNICODE representation is not available, it will be computed
291 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
292
Guido van Rossumd8225182000-03-10 22:33:05 +0000293#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 (assert(PyUnicode_Check(op)), \
295 (((PyASCIIObject *)(op))->wstr) ? \
296 PyUnicode_WSTR_LENGTH(op) : \
297 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
298 PyUnicode_WSTR_LENGTH(op)))
299
Guido van Rossumd8225182000-03-10 22:33:05 +0000300#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200301 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
302
303/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
304 representation on demand. Using this macro is very inefficient now,
305 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
306 use PyUnicode_WRITE() and PyUnicode_READ(). */
307
Guido van Rossumd8225182000-03-10 22:33:05 +0000308#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200309 (assert(PyUnicode_Check(op)), \
310 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
311 PyUnicode_AsUnicode((PyObject *)(op)))
312
Guido van Rossumd8225182000-03-10 22:33:05 +0000313#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 ((const char *)(PyUnicode_AS_UNICODE(op)))
315
316
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
318
319/* Values for PyUnicodeObject.state: */
320
321/* Interning state. */
322#define SSTATE_NOT_INTERNED 0
323#define SSTATE_INTERNED_MORTAL 1
324#define SSTATE_INTERNED_IMMORTAL 2
325
/* Return the state.ascii flag of op.  op is cast to PyASCIIObject*
   without any type check. */
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)(op))->state.ascii)
327
328/* String contains only wstr byte characters. This is only possible
329 when the string was created with a legacy API and PyUnicode_Ready()
330 has not been called yet. */
331#define PyUnicode_WCHAR_KIND 0
332
333/* Return values of the PyUnicode_KIND() macro: */
334
335#define PyUnicode_1BYTE_KIND 1
336#define PyUnicode_2BYTE_KIND 2
337#define PyUnicode_4BYTE_KIND 3
338
339
340/* Return the number of bytes the string uses to represent single characters,
341 this can be 1, 2 or 4. */
342#define PyUnicode_CHARACTER_SIZE(op) \
343 (1 << (PyUnicode_KIND(op) - 1))
344
/* Return pointers to the canonical representation cast as unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly. */
349
350#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
351#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
352#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
353
354/* Return true if the string is compact or 0 if not.
355 No type checks or Ready calls are performed. */
356#define PyUnicode_IS_COMPACT(op) \
357 (((PyASCIIObject*)(op))->state.compact)
358
Victor Stinner157f83f2011-09-28 21:41:31 +0200359/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200360#define PyUnicode_KIND(op) \
361 (assert(PyUnicode_Check(op)), \
362 assert(PyUnicode_IS_READY(op)), \
363 ((PyASCIIObject *)(op))->state.kind)
364
Victor Stinner157f83f2011-09-28 21:41:31 +0200365/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366#define _PyUnicode_COMPACT_DATA(op) \
367 (PyUnicode_IS_COMPACT_ASCII(op) ? \
368 ((void*)((PyASCIIObject*)(op) + 1)) : \
369 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
370
371#define _PyUnicode_NONCOMPACT_DATA(op) \
372 (assert(((PyUnicodeObject*)(op))->data.any), \
373 ((((PyUnicodeObject *)(op))->data.any)))
374
375#define PyUnicode_DATA(op) \
376 (assert(PyUnicode_Check(op)), \
377 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
378 _PyUnicode_NONCOMPACT_DATA(op))
379
380#define _PyUnicode_UTF8(op) \
381 (PyUnicode_IS_COMPACT_ASCII(op) ? \
382 ((char*)((PyASCIIObject*)(op) + 1)) : \
383 ((PyCompactUnicodeObject*)(op))->utf8)
384
385#define _PyUnicode_UTF8_LENGTH(op) \
386 (PyUnicode_IS_COMPACT_ASCII(op) ? \
387 ((PyASCIIObject*)(op))->length : \
388 ((PyCompactUnicodeObject*)(op))->utf8_length)
389
390/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
391
392 The index is a character index, the result is a size in bytes. */
393#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
394
395/* In the access macros below, "kind" may be evaluated more than once.
396 All other macro parameters are evaluated exactly once, so it is safe
397 to put side effects into them (such as increasing the index). */
398
/* Store a code point in the canonical representation.  No sanity checks
   are done; the macro is meant for loops in which the caller has cached
   the kind and the data pointer obtained from other macro calls.
   index is the position in the string (starting at 0) and value is the
   code point which should be written there.  value is converted to the
   element type selected by kind; any kind other than 1BYTE/2BYTE is
   treated as 4BYTE (checked only by assert). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
421
/* Fetch one code point from the string's canonical representation and
   return it as Py_UCS4.  No checks or ready calls are performed.  kind may
   be tested twice; any kind other than 1BYTE/2BYTE reads data as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? ((const Py_UCS1 *)(data))[(index)] : \
      (kind) == PyUnicode_2BYTE_KIND ? ((const Py_UCS2 *)(data))[(index)] : \
                                       ((const Py_UCS4 *)(data))[(index)]))
433
/* Read the code point at index straight from a unicode object.
   This is slower than PyUnicode_READ() because PyUnicode_KIND() is
   invoked here, possibly twice.  Use PyUnicode_READ_CHAR for a single
   read; for several consecutive reads, cache the kind and use
   PyUnicode_READ instead. */
#define PyUnicode_READ_CHAR(unicode, index) \
    ((Py_UCS4) \
     (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
      PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
        ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
        ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)]))
447
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
451#define PyUnicode_GET_LENGTH(op) \
452 (assert(PyUnicode_Check(op)), \
453 assert(PyUnicode_IS_READY(op)), \
454 ((PyASCIIObject *)(op))->length)
455
456
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Reads the state.ready flag; op is cast without any type check. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
461
462/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
463 case. If the canonical representation is not yet set, it will still call
464 PyUnicode_Ready().
465 Returns 0 on success and -1 on errors. */
466#define PyUnicode_READY(op) \
467 (assert(PyUnicode_Check(op)), \
468 (PyUnicode_IS_READY(op) ? \
469 0 : _PyUnicode_Ready((PyUnicodeObject *)(op))))
470
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string. */
474#define PyUnicode_MAX_CHAR_VALUE(op) \
475 (assert(PyUnicode_IS_READY(op)), \
476 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
477 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
478 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
479 (0x7fU) : (0xffU) \
480 ) : \
481 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
482 (0xffffU) : (0x10ffffU) \
483 ))))
484
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000485#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000486
487/* --- Constants ---------------------------------------------------------- */
488
489/* This Unicode character will be used as replacement character during
490 decoding if the errors argument is set to "replace". Note: the
491 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
492 Unicode 3.0. */
493
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200494#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000495
496/* === Public API ========================================================= */
497
498/* --- Plain Py_UNICODE --------------------------------------------------- */
499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200500/* With PEP 393, this is the recommended way to allocate a new unicode object.
501 This function will allocate the object and its buffer in a single memory
502 block. Objects created using this function are not resizable. */
503#ifndef Py_LIMITED_API
504PyAPI_FUNC(PyObject*) PyUnicode_New(
505 Py_ssize_t size, /* Number of code points in the new string */
506 Py_UCS4 maxchar /* maximum code point value in the string */
507 );
508#endif
509
/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert
   unicode objects which were created using the old API to the new flexible
   format introduced with PEP 393. The PyUnicode_READY() macro can be
   more efficient if the string is already ready. */
515#ifndef Py_LIMITED_API
516PyAPI_FUNC(int) _PyUnicode_Ready(
517 PyUnicodeObject *unicode /* Unicode object */
518 );
519#endif
520
/* Copy characters from one unicode object into another. This function
   performs character conversion when necessary and falls back to memcpy if
   possible.

   Fail if 'to' is smaller than how_many or smaller than len(from)-from_start,
   or if kind(from[from_start:from_start+how_many]) > kind(to), or if to has
   more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many
   */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200538PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539 PyObject *to,
540 Py_ssize_t to_start,
541 PyObject *from,
542 Py_ssize_t from_start,
543 Py_ssize_t how_many
544 );
545#endif
546
Guido van Rossumd8225182000-03-10 22:33:05 +0000547/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000548 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000549
550 u may be NULL which causes the contents to be undefined. It is the
551 user's responsibility to fill in the needed data afterwards. Note
552 that modifying the Unicode object contents after construction is
553 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000554
555 The buffer is copied into the new object. */
556
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000557#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000558PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000559 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000560 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000561 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000562#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000563
Georg Brandl952867a2010-06-27 10:17:12 +0000564/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000565PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000566 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000567 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000568 );
569
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000570/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200571 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000573 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000574 );
575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200576#ifndef Py_LIMITED_API
577PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
578 int kind,
579 const void *buffer,
580 Py_ssize_t size);
581#endif
582
583PyAPI_FUNC(PyObject*) PyUnicode_Substring(
584 PyObject *str,
585 Py_ssize_t start,
586 Py_ssize_t end);
587
/* Copy the string into a UCS4 buffer, including the null character if
   copy_null is set. Return NULL and raise an exception on error. Raise a
   ValueError if the buffer is smaller than the string. Return buffer on
   success.

   buflen is the length of the buffer in (Py_UCS4) characters. */
593PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
594 PyObject *unicode,
595 Py_UCS4* buffer,
596 Py_ssize_t buflen,
597 int copy_null);
598
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
   PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
602PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
603
Guido van Rossumd8225182000-03-10 22:33:05 +0000604/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605 Py_UNICODE buffer.
606 If the wchar_t/Py_UNICODE representation is not yet available, this
607 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000608
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000609#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000610PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000611 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000612 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000613#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615/* Return a read-only pointer to the Unicode object's internal
616 Py_UNICODE buffer and save the length at size.
617 If the wchar_t/Py_UNICODE representation is not yet available, this
618 function will calculate it. */
619
620#ifndef Py_LIMITED_API
621PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
622 PyObject *unicode, /* Unicode object */
623 Py_ssize_t *size /* location where to save the length */
624 );
625#endif
626
Guido van Rossumd8225182000-03-10 22:33:05 +0000627/* Get the length of the Unicode object. */
628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
630 PyObject *unicode
631);
632
Victor Stinner157f83f2011-09-28 21:41:31 +0200633/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634 string representation. */
635
Martin v. Löwis18e16552006-02-15 17:27:45 +0000636PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000638 );
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640/* Read a character from the string. */
641
642PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
643 PyObject *unicode,
644 Py_ssize_t index
645 );
646
647/* Write a character to the string. The string must have been created through
648 PyUnicode_New, must not be shared, and must not have been hashed yet. */
649
650PyAPI_FUNC(int) PyUnicode_WriteChar(
651 PyObject *unicode,
652 Py_ssize_t index,
653 Py_UCS4 character
654 );
655
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000656#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000657/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000658PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000659#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000660
Guido van Rossum52c23592000-04-10 13:41:41 +0000661/* Resize an already allocated Unicode object to the new size length.
662
663 *unicode is modified to point to the new (resized) object and 0
664 returned on success.
665
666 This API may only be called by the function which also called the
667 Unicode constructor. The refcount on the object must be 1. Otherwise,
668 an error is returned.
669
670 Error handling is implemented as follows: an exception is set, -1
671 is returned and *unicode left untouched.
672
673*/
674
Mark Hammond91a681d2002-08-12 07:21:58 +0000675PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 PyObject **unicode, /* Pointer to the Unicode object */
677 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000678 );
679
Guido van Rossumd8225182000-03-10 22:33:05 +0000680/* Coerce obj to a Unicode object and return a reference with
681 *incremented* refcount.
682
683 Coercion is done in the following way:
684
Georg Brandl952867a2010-06-27 10:17:12 +0000685 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000686 under the assumption that they contain data using the UTF-8
687 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000688
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000689 2. All other objects (including Unicode objects) raise an
690 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000691
692 The API returns NULL in case of an error. The caller is responsible
693 for decref'ing the returned objects.
694
695*/
696
Mark Hammond91a681d2002-08-12 07:21:58 +0000697PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000699 const char *encoding, /* encoding */
700 const char *errors /* error handling */
701 );
702
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000703/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000704 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000706 Unicode objects are passed back as-is (subclasses are converted to
707 true Unicode objects), all other objects are delegated to
708 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000709 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000710
711 The API returns NULL in case of an error. The caller is responsible
712 for decref'ing the returned objects.
713
714*/
715
Mark Hammond91a681d2002-08-12 07:21:58 +0000716PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000717 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000718 );
719
Victor Stinner1205f272010-09-11 00:54:47 +0000720PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
721 const char *format, /* ASCII-encoded string */
722 va_list vargs
723 );
724PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
725 const char *format, /* ASCII-encoded string */
726 ...
727 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000729#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000730/* Format the object based on the format_spec, as defined in PEP 3101
731 (Advanced String Formatting). */
732PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200733 PyObject *format_spec,
734 Py_ssize_t start,
735 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000736#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000737
Walter Dörwald16807132007-05-25 13:52:07 +0000738PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
739PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000740PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
741 const char *u /* UTF-8 encoded string */
742 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000743#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000744PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000745#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000746
747/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748#define PyUnicode_CHECK_INTERNED(op) \
749 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000750
Guido van Rossumd8225182000-03-10 22:33:05 +0000751/* --- wchar_t support for platforms which support it --------------------- */
752
753#ifdef HAVE_WCHAR_H
754
Georg Brandl952867a2010-06-27 10:17:12 +0000755/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000756 size.
757
758 The buffer is copied into the new object. */
759
Mark Hammond91a681d2002-08-12 07:21:58 +0000760PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000761 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000762 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000763 );
764
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000765/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000766 most size wchar_t characters are copied.
767
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000768 Note that the resulting wchar_t string may or may not be
769 0-terminated. It is the responsibility of the caller to make sure
770 that the wchar_t string is 0-terminated in case this is required by
771 the application.
772
773 Returns the number of wchar_t characters copied (excluding a
774 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000775 error. */
776
Martin v. Löwis18e16552006-02-15 17:27:45 +0000777PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000778 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000779 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000780 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000781 );
782
Victor Stinner137c34c2010-09-29 10:25:54 +0000783/* Convert the Unicode object to a wide character string. The output string
784 always ends with a null character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200785 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000786
787 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it)
788 on success. On error, raises a MemoryError and returns NULL; *size is
789 undefined. */
790
791PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000792 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000793 Py_ssize_t *size /* number of characters of the result */
794 );
795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
797
Guido van Rossumd8225182000-03-10 22:33:05 +0000798#endif
799
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000800/* --- Unicode ordinals --------------------------------------------------- */
801
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802/* Create a Unicode Object from the given Unicode code point ordinal.
803
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000804 The ordinal must be in range(0x10000) on narrow Python builds
805 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
806 raised in case it is not.
807
808*/
809
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000810PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000811
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000812/* --- Free-list management ----------------------------------------------- */
813
814/* Clear the free list used by the Unicode implementation.
815
816 This can be used to release memory used for objects on the free
817 list back to the Python memory allocator.
818
819*/
820
821PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
822
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000824
825 Many of these APIs take two arguments encoding and errors. These
826 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000827 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000828
Georg Brandl952867a2010-06-27 10:17:12 +0000829 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000830
831 Error handling is set by errors which may also be set to NULL
832 meaning to use the default handling defined for the codec. Default
833 error handling for all builtin codecs is "strict" (ValueErrors are
834 raised).
835
836 The codecs all use a similar interface. Only deviations from the
837 generic ones are documented.
838
839*/
840
Fred Drakecb093fe2000-05-09 19:51:53 +0000841/* --- Manage the default encoding ---------------------------------------- */
842
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000843/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000844 Unicode object unicode and the size of the encoded representation
845 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000846
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000847 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000848
Victor Stinner157f83f2011-09-28 21:41:31 +0200849 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 and subsequent calls will return the same string. The memory is released
851 when the unicodeobject is deallocated.
852
853 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
854 support the previous internal function with the same behaviour.
855
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000856 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000857 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000858
859 *** If you need to access the Unicode object as UTF-8 bytes string,
860 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000861*/
862
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000863#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000865 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000866 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200867#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000868#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000869
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000870/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000871 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
874 in the unicodeobject.
875
876 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
877 support the previous internal function with the same behaviour.
878
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000879 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000880 extracted from the returned data.
881
882 *** This API is for interpreter INTERNAL USE ONLY and will likely
883 *** be removed or changed in the future.
884
885 *** If you need to access the Unicode object as UTF-8 bytes string,
886 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000887
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000888*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000889
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000890#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
892#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000893#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000894
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000895/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000896
Mark Hammond91a681d2002-08-12 07:21:58 +0000897PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000898
Guido van Rossumd8225182000-03-10 22:33:05 +0000899/* --- Generic Codecs ----------------------------------------------------- */
900
901/* Create a Unicode object by decoding the encoded string s of the
902 given size. */
903
Mark Hammond91a681d2002-08-12 07:21:58 +0000904PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000905 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000906 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000907 const char *encoding, /* encoding */
908 const char *errors /* error handling */
909 );
910
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000911/* Decode a Unicode object unicode and return the result as Python
912 object. */
913
914PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 PyObject *unicode, /* Unicode object */
916 const char *encoding, /* encoding */
917 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000918 );
919
920/* Decode a Unicode object unicode and return the result as Unicode
921 object. */
922
923PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924 PyObject *unicode, /* Unicode object */
925 const char *encoding, /* encoding */
926 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000927 );
928
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000929/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000930 Python string object. */
931
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000932#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000933PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000934 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000935 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000936 const char *encoding, /* encoding */
937 const char *errors /* error handling */
938 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000939#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000940
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000941/* Encodes a Unicode object and returns the result as Python
942 object. */
943
944PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000945 PyObject *unicode, /* Unicode object */
946 const char *encoding, /* encoding */
947 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000948 );
949
Guido van Rossumd8225182000-03-10 22:33:05 +0000950/* Encodes a Unicode object and returns the result as Python string
951 object. */
952
Mark Hammond91a681d2002-08-12 07:21:58 +0000953PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000954 PyObject *unicode, /* Unicode object */
955 const char *encoding, /* encoding */
956 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000957 );
958
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000959/* Encodes a Unicode object and returns the result as Unicode
960 object. */
961
962PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 PyObject *unicode, /* Unicode object */
964 const char *encoding, /* encoding */
965 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000966 );
967
968/* Build an encoding map. */
969
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000970PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
971 PyObject* string /* 256 character map */
972 );
973
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974/* --- UTF-7 Codecs ------------------------------------------------------- */
975
Mark Hammond91a681d2002-08-12 07:21:58 +0000976PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000977 const char *string, /* UTF-7 encoded string */
978 Py_ssize_t length, /* size of string */
979 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 );
981
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000982PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000983 const char *string, /* UTF-7 encoded string */
984 Py_ssize_t length, /* size of string */
985 const char *errors, /* error handling */
986 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000987 );
988
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000989#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000990PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991 const Py_UNICODE *data, /* Unicode char buffer */
992 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
993 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
994 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
995 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000997#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000998
Guido van Rossumd8225182000-03-10 22:33:05 +0000999/* --- UTF-8 Codecs ------------------------------------------------------- */
1000
Mark Hammond91a681d2002-08-12 07:21:58 +00001001PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001002 const char *string, /* UTF-8 encoded string */
1003 Py_ssize_t length, /* size of string */
1004 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001005 );
1006
Walter Dörwald69652032004-09-07 20:24:22 +00001007PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001008 const char *string, /* UTF-8 encoded string */
1009 Py_ssize_t length, /* size of string */
1010 const char *errors, /* error handling */
1011 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001012 );
1013
Mark Hammond91a681d2002-08-12 07:21:58 +00001014PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001015 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001016 );
1017
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001018#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1020 PyObject *unicode,
1021 const char *errors);
1022
Mark Hammond91a681d2002-08-12 07:21:58 +00001023PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001024 const Py_UNICODE *data, /* Unicode char buffer */
1025 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1026 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001027 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001028#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001029
Walter Dörwald41980ca2007-08-16 21:55:45 +00001030/* --- UTF-32 Codecs ------------------------------------------------------ */
1031
1032/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1033 the corresponding Unicode object.
1034
1035 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001036 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001037
1038 If byteorder is non-NULL, the decoder starts decoding using the
1039 given byte order:
1040
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001041 *byteorder == -1: little endian
1042 *byteorder == 0: native order
1043 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001044
1045 In native mode, the first four bytes of the stream are checked for a
1046 BOM mark. If found, the BOM mark is analysed, the byte order
1047 adjusted and the BOM skipped. In the other modes, no BOM mark
1048 interpretation is done. After completion, *byteorder is set to the
1049 current byte order at the end of input data.
1050
1051 If byteorder is NULL, the codec starts in native order mode.
1052
1053*/
1054
1055PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 const char *string, /* UTF-32 encoded string */
1057 Py_ssize_t length, /* size of string */
1058 const char *errors, /* error handling */
1059 int *byteorder /* pointer to byteorder to use
1060 0=native;-1=LE,1=BE; updated on
1061 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001062 );
1063
1064PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 const char *string, /* UTF-32 encoded string */
1066 Py_ssize_t length, /* size of string */
1067 const char *errors, /* error handling */
1068 int *byteorder, /* pointer to byteorder to use
1069 0=native;-1=LE,1=BE; updated on
1070 exit */
1071 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001072 );
1073
1074/* Returns a Python string using the UTF-32 encoding in native byte
1075 order. The string always starts with a BOM mark. */
1076
1077PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001079 );
1080
1081/* Returns a Python string object holding the UTF-32 encoded value of
1082 the Unicode data.
1083
1084 If byteorder is not 0, output is written according to the following
1085 byte order:
1086
1087 byteorder == -1: little endian
1088 byteorder == 0: native byte order (writes a BOM mark)
1089 byteorder == 1: big endian
1090
1091 If byteorder is 0, the output string will always start with the
1092 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1093 prepended.
1094
1095*/
1096
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001097#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001098PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099 const Py_UNICODE *data, /* Unicode char buffer */
1100 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1101 const char *errors, /* error handling */
1102 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001103 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001104#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001105
Guido van Rossumd8225182000-03-10 22:33:05 +00001106/* --- UTF-16 Codecs ------------------------------------------------------ */
1107
Guido van Rossum9e896b32000-04-05 20:11:21 +00001108/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001109 the corresponding Unicode object.
1110
1111 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001112 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001113
1114 If byteorder is non-NULL, the decoder starts decoding using the
1115 given byte order:
1116
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 *byteorder == -1: little endian
1118 *byteorder == 0: native order
1119 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001120
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001121 In native mode, the first two bytes of the stream are checked for a
1122 BOM mark. If found, the BOM mark is analysed, the byte order
1123 adjusted and the BOM skipped. In the other modes, no BOM mark
1124 interpretation is done. After completion, *byteorder is set to the
1125 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001126
1127 If byteorder is NULL, the codec starts in native order mode.
1128
1129*/
1130
Mark Hammond91a681d2002-08-12 07:21:58 +00001131PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001132 const char *string, /* UTF-16 encoded string */
1133 Py_ssize_t length, /* size of string */
1134 const char *errors, /* error handling */
1135 int *byteorder /* pointer to byteorder to use
1136 0=native;-1=LE,1=BE; updated on
1137 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001138 );
1139
Walter Dörwald69652032004-09-07 20:24:22 +00001140PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 const char *string, /* UTF-16 encoded string */
1142 Py_ssize_t length, /* size of string */
1143 const char *errors, /* error handling */
1144 int *byteorder, /* pointer to byteorder to use
1145 0=native;-1=LE,1=BE; updated on
1146 exit */
1147 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001148 );
1149
Guido van Rossumd8225182000-03-10 22:33:05 +00001150/* Returns a Python string using the UTF-16 encoding in native byte
1151 order. The string always starts with a BOM mark. */
1152
Mark Hammond91a681d2002-08-12 07:21:58 +00001153PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001154 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001155 );
1156
1157/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001158 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001159
1160 If byteorder is not 0, output is written according to the following
1161 byte order:
1162
1163 byteorder == -1: little endian
1164 byteorder == 0: native byte order (writes a BOM mark)
1165 byteorder == 1: big endian
1166
1167 If byteorder is 0, the output string will always start with the
1168 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1169 prepended.
1170
1171 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1172 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001173 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001174
1175*/
1176
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001177#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001178PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 const Py_UNICODE *data, /* Unicode char buffer */
1180 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1181 const char *errors, /* error handling */
1182 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001183 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001184#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001185
1186/* --- Unicode-Escape Codecs ---------------------------------------------- */
1187
Mark Hammond91a681d2002-08-12 07:21:58 +00001188PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 const char *string, /* Unicode-Escape encoded string */
1190 Py_ssize_t length, /* size of string */
1191 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001192 );
1193
Mark Hammond91a681d2002-08-12 07:21:58 +00001194PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001196 );
1197
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001198#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001199PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 const Py_UNICODE *data, /* Unicode char buffer */
1201 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001202 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001203#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001204
1205/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1206
Mark Hammond91a681d2002-08-12 07:21:58 +00001207PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 const char *string, /* Raw-Unicode-Escape encoded string */
1209 Py_ssize_t length, /* size of string */
1210 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001211 );
1212
Mark Hammond91a681d2002-08-12 07:21:58 +00001213PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001215 );
1216
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001217#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001218PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 const Py_UNICODE *data, /* Unicode char buffer */
1220 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001221 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001222#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001223
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001224/* --- Unicode Internal Codec ---------------------------------------------
1225
1226 Only for internal use in _codecsmodule.c */
1227
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001228#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001229PyObject *_PyUnicode_DecodeUnicodeInternal(
1230 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001231 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001232 const char *errors
1233 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001234#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001235
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001236/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001237
1238 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1239
1240*/
1241
Mark Hammond91a681d2002-08-12 07:21:58 +00001242PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 const char *string, /* Latin-1 encoded string */
1244 Py_ssize_t length, /* size of string */
1245 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001246 );
1247
Mark Hammond91a681d2002-08-12 07:21:58 +00001248PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001250 );
1251
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001252#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1254 PyObject* unicode,
1255 const char* errors);
1256
Mark Hammond91a681d2002-08-12 07:21:58 +00001257PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 const Py_UNICODE *data, /* Unicode char buffer */
1259 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1260 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001261 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001262#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001263
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001265
1266 Only 7-bit ASCII data is accepted. All other codes generate errors.
1267
1268*/
1269
Mark Hammond91a681d2002-08-12 07:21:58 +00001270PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 const char *string, /* ASCII encoded string */
1272 Py_ssize_t length, /* size of string */
1273 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001274 );
1275
Mark Hammond91a681d2002-08-12 07:21:58 +00001276PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001278 );
1279
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001280#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1282 PyObject* unicode,
1283 const char* errors);
1284
Mark Hammond91a681d2002-08-12 07:21:58 +00001285PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286 const Py_UNICODE *data, /* Unicode char buffer */
1287 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1288 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001289 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001290#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001291
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001293
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001295
1296 Decoding mappings must map single string characters to single
1297 Unicode characters, integers (which are then interpreted as Unicode
1298 ordinals) or None (meaning "undefined mapping" and causing an
1299 error).
1300
1301 Encoding mappings must map single Unicode characters to single
1302 string characters, integers (which are then interpreted as Latin-1
1303 ordinals) or None (meaning "undefined mapping" and causing an
1304 error).
1305
1306 If a character lookup fails with a LookupError, the character is
1307 copied as-is meaning that its ordinal value will be interpreted as
1308 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1309 to contain those mappings which map characters to different code
1310 points.
1311
1312*/
1313
Mark Hammond91a681d2002-08-12 07:21:58 +00001314PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 const char *string, /* Encoded string */
1316 Py_ssize_t length, /* size of string */
1317 PyObject *mapping, /* character mapping
1318 (char ordinal -> unicode ordinal) */
1319 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001320 );
1321
Mark Hammond91a681d2002-08-12 07:21:58 +00001322PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001323 PyObject *unicode, /* Unicode object */
1324 PyObject *mapping /* character mapping
1325 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001326 );
1327
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001328#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001329PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 const Py_UNICODE *data, /* Unicode char buffer */
1331 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1332 PyObject *mapping, /* character mapping
1333 (unicode ordinal -> char ordinal) */
1334 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001335 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001336#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001337
1338/* Translate a Py_UNICODE buffer of the given length by applying a
1339 character mapping table to it and return the resulting Unicode
1340 object.
1341
1342 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001344
1345 Mapping tables may be dictionaries or sequences. Unmapped character
1346 ordinals (ones which cause a LookupError) are left untouched and
1347 are copied as-is.
1348
1349*/
1350
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001351#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001352PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001353 const Py_UNICODE *data, /* Unicode char buffer */
1354 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1355 PyObject *table, /* Translate table */
1356 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001357 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001358#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001359
Victor Stinner99b95382011-07-04 14:23:54 +02001360#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001361
Guido van Rossumefec1152000-03-28 02:01:15 +00001362/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001363
Mark Hammond91a681d2002-08-12 07:21:58 +00001364PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001365 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001366 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001367 const char *errors /* error handling */
1368 );
1369
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001370PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1371 const char *string, /* MBCS encoded string */
1372 Py_ssize_t length, /* size of string */
1373 const char *errors, /* error handling */
1374 Py_ssize_t *consumed /* bytes consumed */
1375 );
1376
Mark Hammond91a681d2002-08-12 07:21:58 +00001377PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001378 PyObject *unicode /* Unicode object */
1379 );
1380
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001381#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001382PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001383 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001384 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001385 const char *errors /* error handling */
1386 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001387#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001388
Victor Stinner99b95382011-07-04 14:23:54 +02001389#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001390
Guido van Rossum9e896b32000-04-05 20:11:21 +00001391/* --- Decimal Encoder ---------------------------------------------------- */
1392
1393/* Takes a Unicode string holding a decimal value and writes it into
1394 an output buffer using standard ASCII digit codes.
1395
1396 The output buffer has to provide at least length+1 bytes of storage
1397 area. The output string is 0-terminated.
1398
1399 The encoder converts whitespace to ' ', decimal characters to their
1400 corresponding ASCII digit and all other Latin-1 characters except
1401 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1402 are treated as errors. This includes embedded NUL characters.
1403
1404 Error handling is defined by the errors argument:
1405
1406 NULL or "strict": raise a ValueError
1407 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001409 "replace": replaces illegal characters with '?'
1410
1411 Returns 0 on success, -1 on failure.
1412
1413*/
1414
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001415#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001416PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 Py_UNICODE *s, /* Unicode buffer */
1418 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1419 char *output, /* Output buffer; must have size >= length+1
                     (room for the terminating 0 byte) */
1420 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001421 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001422#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001423
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001424/* Transforms code points that have decimal digit property to the
1425 corresponding ASCII digit code points.
1426
1427 Returns a new Unicode string on success, NULL on failure.
1428*/
1429
Georg Brandlb5503082010-12-05 11:40:48 +00001430#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001431PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1432 Py_UNICODE *s, /* Unicode buffer */
1433 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1434 );
Georg Brandlb5503082010-12-05 11:40:48 +00001435#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1438 as argument instead of a raw buffer and length. This function additionally
1439 transforms spaces to ASCII because this is what the callers in longobject,
1440 floatobject, and complexobject did anyways. */
1441
1442#ifndef Py_LIMITED_API
1443PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1444 PyObject *unicode /* Unicode object */
1445 );
1446#endif
1447
Martin v. Löwis011e8422009-05-05 04:43:17 +00001448/* --- File system encoding ---------------------------------------------- */
1449
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001450/* ParseTuple converter: encode str objects to bytes using
1451 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001452
1453PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1454
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001455/* ParseTuple converter: decode bytes objects to unicode using
1456 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1457
1458PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1459
Victor Stinner77c38622010-05-14 15:58:55 +00001460/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1461 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001462
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001463 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1464 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001465
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001466 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001467*/
1468
1469PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1470 const char *s /* encoded string */
1471 );
1472
Victor Stinner77c38622010-05-14 15:58:55 +00001473/* Decode a string using Py_FileSystemDefaultEncoding
1474 and the "surrogateescape" error handler.
1475
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001476 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1477 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001478*/
1479
Martin v. Löwis011e8422009-05-05 04:43:17 +00001480PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1481 const char *s, /* encoded string */
1482 Py_ssize_t size /* size */
1483 );
1484
Victor Stinnerae6265f2010-05-15 16:27:27 +00001485/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001486 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001487
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001488 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1489 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001490*/
1491
1492PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1493 PyObject *unicode
1494 );
1495
Guido van Rossumd8225182000-03-10 22:33:05 +00001496/* --- Methods & Slots ----------------------------------------------------
1497
1498 These are capable of handling Unicode objects and strings on input
1499 (we refer to them as strings in the descriptions) and return
1500 Unicode objects or integers as appropriate. */
1501
1502/* Concat two strings giving a new Unicode string. */
1503
Mark Hammond91a681d2002-08-12 07:21:58 +00001504PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001505 PyObject *left, /* Left string */
1506 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001507 );
1508
Walter Dörwald1ab83302007-05-18 17:15:44 +00001509/* Concat two strings and put the result in *pleft
1510 (sets *pleft to NULL on error) */
1511
1512PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 PyObject **pleft, /* Pointer to left string */
1514 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001515 );
1516
1517/* Concat two strings, put the result in *pleft and drop the right object
1518 (sets *pleft to NULL on error) */
1519
1520PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001521 PyObject **pleft, /* Pointer to left string */
1522 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001523 );
1524
Guido van Rossumd8225182000-03-10 22:33:05 +00001525/* Split a string giving a list of Unicode strings.
1526
1527 If sep is NULL, splitting will be done at all whitespace
1528 substrings. Otherwise, splits occur at the given separator.
1529
1530 At most maxsplit splits will be done. If negative, no limit is set.
1531
1532 Separators are not included in the resulting list.
1533
1534*/
1535
Mark Hammond91a681d2002-08-12 07:21:58 +00001536PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 PyObject *s, /* String to split */
1538 PyObject *sep, /* String separator */
1539 Py_ssize_t maxsplit /* Maxsplit count */
1540 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001541
1542/* Ditto, but split at line breaks.
1543
1544 CRLF is considered to be one line break. Line breaks are not
1545 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001546
Mark Hammond91a681d2002-08-12 07:21:58 +00001547PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001548 PyObject *s, /* String to split */
1549 int keepends /* If true, line end markers are included */
1550 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001551
Thomas Wouters477c8d52006-05-27 19:21:47 +00001552/* Partition a string using a given separator. */
1553
1554PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 PyObject *s, /* String to partition */
1556 PyObject *sep /* String separator */
1557 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001558
1559/* Partition a string using a given separator, searching from the end of the
1560 string. */
1561
1562PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 PyObject *s, /* String to partition */
1564 PyObject *sep /* String separator */
1565 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001566
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001567/* Split a string giving a list of Unicode strings.
1568
1569 If sep is NULL, splitting will be done at all whitespace
1570 substrings. Otherwise, splits occur at the given separator.
1571
1572 At most maxsplit splits will be done; if maxsplit is negative,
1573 no limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits
1574 from the end of the string.
1575
1576 Separators are not included in the resulting list.
1577
1578*/
1579
1580PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001581 PyObject *s, /* String to split */
1582 PyObject *sep, /* String separator */
1583 Py_ssize_t maxsplit /* Maxsplit count */
1584 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001585
Guido van Rossumd8225182000-03-10 22:33:05 +00001586/* Translate a string by applying a character mapping table to it and
1587 return the resulting Unicode object.
1588
1589 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001590 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001591
1592 Mapping tables may be dictionaries or sequences. Unmapped character
1593 ordinals (ones which cause a LookupError) are left untouched and
1594 are copied as-is.
1595
1596*/
1597
Mark Hammond91a681d2002-08-12 07:21:58 +00001598PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001599 PyObject *str, /* String */
1600 PyObject *table, /* Translate table */
1601 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001602 );
1603
1604/* Join a sequence of strings using the given separator and return
1605 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001606
Mark Hammond91a681d2002-08-12 07:21:58 +00001607PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001608 PyObject *separator, /* Separator string */
1609 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001610 );
1611
1612/* Return 1 if substr matches str[start:end] at the given tail end, 0
1613 otherwise. */
1614
Martin v. Löwis18e16552006-02-15 17:27:45 +00001615PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001616 PyObject *str, /* String */
1617 PyObject *substr, /* Prefix or Suffix string */
1618 Py_ssize_t start, /* Start index */
1619 Py_ssize_t end, /* Stop index */
1620 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001621 );
1622
1623/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001624 given search direction or -1 if not found. -2 is returned in case
1625 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001626
Martin v. Löwis18e16552006-02-15 17:27:45 +00001627PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001628 PyObject *str, /* String */
1629 PyObject *substr, /* Substring to find */
1630 Py_ssize_t start, /* Start index */
1631 Py_ssize_t end, /* Stop index */
1632 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001633 );
1634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635/* Like PyUnicode_Find, but search for single character only. */
1636PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1637 PyObject *str, /* String */
1638 Py_UCS4 ch, /* Single character to find */
1639 Py_ssize_t start, /* Start index */
1640 Py_ssize_t end, /* Stop index */
1641 int direction /* Find direction, as for PyUnicode_Find:
                     +1 forward, -1 backward */
1642 );
1643
Barry Warsaw51ac5802000-03-20 16:36:48 +00001644/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001645
Martin v. Löwis18e16552006-02-15 17:27:45 +00001646PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001647 PyObject *str, /* String */
1648 PyObject *substr, /* Substring to count */
1649 Py_ssize_t start, /* Start index */
1650 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001651 );
1652
Barry Warsaw51ac5802000-03-20 16:36:48 +00001653/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001654 and return the resulting Unicode object. */
1655
Mark Hammond91a681d2002-08-12 07:21:58 +00001656PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001657 PyObject *str, /* String */
1658 PyObject *substr, /* Substring to find */
1659 PyObject *replstr, /* Substring to replace */
1660 Py_ssize_t maxcount /* Max. number of replacements to apply;
1661 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001662 );
1663
1664/* Compare two strings and return -1, 0, 1 for less than, equal,
1665 greater than resp. */
1666
Mark Hammond91a681d2002-08-12 07:21:58 +00001667PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001668 PyObject *left, /* Left string */
1669 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001670 );
1671
/* Compare a string object with an ASCII-encoded C string.
   NOTE(review): presumably returns -1, 0, 1 for less than, equal,
   greater than, like PyUnicode_Compare -- confirm against the
   implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001672PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1673 PyObject *left, /* Left string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001674 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001675 );
1676
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001677/* Rich compare two strings and return one of the following:
1678
1679 - NULL in case an exception was raised
1680 - Py_True or Py_False for successful comparisons
1681 - Py_NotImplemented in case the type combination is unknown
1682
1683 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1684 case the conversion of the arguments to Unicode fails with a
1685 UnicodeDecodeError.
1686
1687 Possible values for op:
1688
1689 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1690
1691*/
1692
1693PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001694 PyObject *left, /* Left string */
1695 PyObject *right, /* Right string */
1696 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001697 );
1698
Thomas Wouters7e474022000-07-16 12:04:32 +00001699/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001700 the resulting Unicode string. */
1701
Mark Hammond91a681d2002-08-12 07:21:58 +00001702PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001703 PyObject *format, /* Format string */
1704 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001705 );
1706
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001707/* Checks whether element is contained in container and return 1/0
1708 accordingly.
1709
1710 element has to coerce to a one-element Unicode string. -1 is
1711 returned in case of an error. */
1712
Mark Hammond91a681d2002-08-12 07:21:58 +00001713PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001714 PyObject *container, /* Container string */
1715 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001716 );
1717
Martin v. Löwis47383402007-08-15 07:32:56 +00001718/* Checks whether argument is a valid identifier. */
1719
1720PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1721
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001722#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001723/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001724PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001725 PyUnicodeObject *self,
1726 int striptype,
1727 PyObject *sepobj
1728 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001729#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001730
Eric Smith5807c412008-05-11 21:00:57 +00001731/* Using the current locale, insert the thousands grouping
1732 into the string pointed to by buffer. For the argument descriptions,
1733 see Objects/stringlib/localeutil.h */
1734
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001735#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001736PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1737 Py_ssize_t n_buffer,
1738 Py_UNICODE *digits,
1739 Py_ssize_t n_digits,
1740 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001741#endif
Eric Smith5807c412008-05-11 21:00:57 +00001742
Eric Smitha3b1ac82009-04-03 14:45:06 +00001743/* Using explicit passed-in values, insert the thousands grouping
1744 into the string pointed to by buffer. For the argument descriptions,
1745 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001746#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1748 int kind,
1749 void *buffer,
1750 Py_ssize_t n_buffer,
1751 void *digits,
1752 Py_ssize_t n_digits,
1753 Py_ssize_t min_width,
1754 const char *grouping,
1755 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001756#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001757/* === Characters Type APIs =============================================== */
1758
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001759/* Helper array used by Py_UNICODE_ISSPACE(). */
1760
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001761#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001762PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1763
Guido van Rossumd8225182000-03-10 22:33:05 +00001764/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001765 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001766
1767 These APIs are implemented in Objects/unicodectype.c.
1768
1769*/
1770
Mark Hammond91a681d2002-08-12 07:21:58 +00001771PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001772 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001773 );
1774
Mark Hammond91a681d2002-08-12 07:21:58 +00001775PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001776 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001777 );
1778
Mark Hammond91a681d2002-08-12 07:21:58 +00001779PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001780 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001781 );
1782
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001783PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001784 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001785 );
1786
1787PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001788 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001789 );
1790
Mark Hammond91a681d2002-08-12 07:21:58 +00001791PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001792 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001793 );
1794
Mark Hammond91a681d2002-08-12 07:21:58 +00001795PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001796 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001797 );
1798
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001799PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1800 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001801 );
1802
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001803PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1804 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001805 );
1806
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001807PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1808 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001809 );
1810
Mark Hammond91a681d2002-08-12 07:21:58 +00001811PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001812 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001813 );
1814
Mark Hammond91a681d2002-08-12 07:21:58 +00001815PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001816 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001817 );
1818
Mark Hammond91a681d2002-08-12 07:21:58 +00001819PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001820 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001821 );
1822
Mark Hammond91a681d2002-08-12 07:21:58 +00001823PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001824 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001825 );
1826
Mark Hammond91a681d2002-08-12 07:21:58 +00001827PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001828 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001829 );
1830
Mark Hammond91a681d2002-08-12 07:21:58 +00001831PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001832 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001833 );
1834
Georg Brandl559e5d72008-06-11 18:37:52 +00001835PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001836 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001837 );
1838
Mark Hammond91a681d2002-08-12 07:21:58 +00001839PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001840 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001841 );
1842
Victor Stinneref8d95c2010-08-16 22:03:11 +00001843PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1844 const Py_UNICODE *u
1845 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001846
1847PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001848 Py_UNICODE *s1,
1849 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001850
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001851PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1852 Py_UNICODE *s1, const Py_UNICODE *s2);
1853
Martin v. Löwis5b222132007-06-10 09:51:05 +00001854PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001855 Py_UNICODE *s1,
1856 const Py_UNICODE *s2,
1857 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001858
/* Both inputs are read-only (const) Py_UNICODE pointers; the result is
   an int.
   NOTE(review): by its name this mirrors C strcmp, so presumably the
   result is negative, zero or positive according to the ordering of s1
   and s2; whether only the sign is meaningful needs confirming. */
1859PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001860 const Py_UNICODE *s1,
1861 const Py_UNICODE *s2
1862 );
1863
/* Both inputs are read-only (const) Py_UNICODE pointers, n is a size_t
   bound; the result is an int.
   NOTE(review): by its name this mirrors C strncmp, so presumably at
   most n Py_UNICODE elements are compared and the sign of the result
   gives the ordering - confirm against the implementation. */
1864PyAPI_FUNC(int) Py_UNICODE_strncmp(
1865 const Py_UNICODE *s1,
1866 const Py_UNICODE *s2,
1867 size_t n
1868 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001869
/* s is a read-only (const) Py_UNICODE pointer and c a single Py_UNICODE
   value. The return type is a non-const Py_UNICODE pointer, so the const
   qualifier of s is not carried over to the result.
   NOTE(review): by its name this mirrors C strchr, presumably returning
   the first match of c in s or NULL when absent - confirm. */
1870PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001871 const Py_UNICODE *s,
1872 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001873 );
1874
/* s is a read-only (const) Py_UNICODE pointer and c a single Py_UNICODE
   value. The return type is a non-const Py_UNICODE pointer, so the const
   qualifier of s is not carried over to the result.
   NOTE(review): by its name this mirrors C strrchr, presumably returning
   the last match of c in s or NULL when absent - confirm. */
Victor Stinner331ea922010-08-10 16:37:20 +00001875PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001876 const Py_UNICODE *s,
1877 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001878 );
1879
/* Takes a pointer to const Py_UCS4 (the buffer is not written through u)
   and returns a size_t. Same parameter shape as Py_UNICODE_strlen, with
   Py_UCS4 as the element type.
   NOTE(review): presumably the result counts Py_UCS4 elements before the
   terminating zero, not bytes - confirm against the implementation. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880PyAPI_FUNC(size_t) Py_UCS4_strlen(
1881 const Py_UCS4 *u
1882 );
1883
/* s1 is a writable Py_UCS4 buffer, s2 is read-only (const); the result
   is a Py_UCS4 pointer. No size parameter is taken.
   NOTE(review): by its name this mirrors C strcpy, so presumably s1 must
   already be large enough for s2 plus its terminator and the return
   value is s1 - confirm against the implementation. */
1884PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1885 Py_UCS4 *s1,
1886 const Py_UCS4 *s2);
1887
/* s1 is a writable Py_UCS4 buffer, s2 is read-only (const); the result
   is a Py_UCS4 pointer. No size parameter is taken.
   NOTE(review): by its name this mirrors C strcat, so presumably s2 is
   appended after the existing contents of s1 and the caller must supply
   enough room - confirm against the implementation. */
1888PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1889 Py_UCS4 *s1, const Py_UCS4 *s2);
1890
/* s1 is a writable Py_UCS4 buffer, s2 is read-only (const), n is a
   size_t bound; the result is a Py_UCS4 pointer.
   NOTE(review): by its name this mirrors C strncpy. If it shares that
   contract, n counts Py_UCS4 elements and s1 is left without a
   terminator when s2 holds n or more elements - confirm both points. */
1891PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1892 Py_UCS4 *s1,
1893 const Py_UCS4 *s2,
1894 size_t n);
1895
/* Both inputs are read-only (const) Py_UCS4 pointers; the result is an
   int.
   NOTE(review): by its name this mirrors C strcmp, so presumably the
   result is negative, zero or positive according to the ordering of s1
   and s2; whether only the sign is meaningful needs confirming. */
1896PyAPI_FUNC(int) Py_UCS4_strcmp(
1897 const Py_UCS4 *s1,
1898 const Py_UCS4 *s2
1899 );
1900
/* Both inputs are read-only (const) Py_UCS4 pointers, n is a size_t
   bound; the result is an int.
   NOTE(review): by its name this mirrors C strncmp, so presumably at
   most n Py_UCS4 elements are compared and the sign of the result gives
   the ordering - confirm against the implementation. */
1901PyAPI_FUNC(int) Py_UCS4_strncmp(
1902 const Py_UCS4 *s1,
1903 const Py_UCS4 *s2,
1904 size_t n
1905 );
1906
/* s is a read-only (const) Py_UCS4 pointer and c a single Py_UCS4 value.
   The return type is a non-const Py_UCS4 pointer, so the const qualifier
   of s is not carried over to the result.
   NOTE(review): by its name this mirrors C strchr, presumably returning
   the first match of c in s or NULL when absent - confirm. */
1907PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1908 const Py_UCS4 *s,
1909 Py_UCS4 c
1910 );
1911
/* s is a read-only (const) Py_UCS4 pointer and c a single Py_UCS4 value.
   The return type is a non-const Py_UCS4 pointer, so the const qualifier
   of s is not carried over to the result.
   NOTE(review): by its name this mirrors C strrchr, presumably returning
   the last match of c in s or NULL when absent - confirm. */
1912PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1913 const Py_UCS4 *s,
1914 Py_UCS4 c
1915 );
1916
Victor Stinner71133ff2010-09-01 23:43:53 +00001917/* Create a copy of a unicode string ending with a nul character. Return NULL
1918 and raise a MemoryError exception on memory allocation failure, otherwise
1919 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1920
/* The argument is passed as a generic PyObject pointer and the result is
   a Py_UNICODE pointer that the caller owns and must release.
   NOTE(review): presumably unicode must be a str object; any type check
   lives in the implementation, which is not visible here - confirm. */
Victor Stinner46408602010-09-03 16:18:00 +00001921PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001922 PyObject *unicode
1923 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001924#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001925
Guido van Rossumd8225182000-03-10 22:33:05 +00001926#ifdef __cplusplus
1927}
1928#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001929#endif /* !Py_UNICODEOBJECT_H */