blob: a2c07f5199f3981dcccd7b9f4c8ee548330e0b6a [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
88 With PEP 393, Py_UNICODE is deprecated and replaced with a
89 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` elements of the Py_UNICODE buffer
   `target`.  Each argument is evaluated exactly once (in the order target,
   value, length), so arguments with side effects are safe. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length); Py_ssize_t i_;\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
Ezio Melotti8c9375b2011-08-22 20:03:25 +0300177/* macros to work with surrogates */
/* Range checks for UTF-16 surrogate code units.  The argument is fully
   parenthesized so that expressions built from low-precedence operators
   (e.g. `c ? a : b`, `x & 0xFFFF`) are classified correctly.  Note that
   `ch` is evaluated twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
209 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200210 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000211 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212 struct {
213 /*
214 SSTATE_NOT_INTERNED (0)
215 SSTATE_INTERNED_MORTAL (1)
216 SSTATE_INTERNED_IMMORTAL (2)
217
218 If interned != SSTATE_NOT_INTERNED, the two references from the
219 dictionary to this object are *not* counted in ob_refcnt.
220 */
221 unsigned int interned:2;
222 /* Character size:
223
224 PyUnicode_WCHAR_KIND (0): wchar_t*
225 PyUnicode_1BYTE_KIND (1): Py_UCS1*
226 PyUnicode_2BYTE_KIND (2): Py_UCS2*
227 PyUnicode_4BYTE_KIND (3): Py_UCS4*
228 */
229 unsigned int kind:2;
230 /* Compact is with respect to the allocation scheme. Compact unicode
231 objects only require one memory block while non-compact objects use
232 one block for the PyUnicodeObject struct and another for its data
233 buffer. */
234 unsigned int compact:1;
235 /* Compact objects which are ASCII-only also have the state.compact
236 flag set, and use the PyASCIIObject struct. */
237 unsigned int ascii:1;
238 /* The ready flag indicates whether the object layout is initialized
239 completely. This means that this is either a compact object, or
240 the data pointer is filled out. The bit is redundant, and helps
241 to minimize the test in PyUnicode_IS_READY(). */
242 unsigned int ready:1;
243 } state;
244 wchar_t *wstr; /* wchar_t representation (null-terminated) */
245} PyASCIIObject;
246
247/* Non-ASCII strings allocated through PyUnicode_New use the
248 PyCompactUnicodeObject structure. state.compact is set, and the data
249 immediately follow the structure. */
250typedef struct {
251 PyASCIIObject _base;
252 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
253 * terminating \0. */
254 char *utf8; /* UTF-8 representation (null-terminated) */
255 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
256 * surrogates count as two code points. */
257} PyCompactUnicodeObject;
258
259/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
260 PyUnicodeObject structure. The actual string data is initially in the wstr
261 block, and copied into the data block using PyUnicode_Ready. */
262typedef struct {
263 PyCompactUnicodeObject _base;
264 union {
265 void *any;
266 Py_UCS1 *latin1;
267 Py_UCS2 *ucs2;
268 Py_UCS4 *ucs4;
269 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000270} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000271#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000272
Mark Hammond91a681d2002-08-12 07:21:58 +0000273PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000274PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000275
Thomas Wouters27d517b2007-02-25 20:39:11 +0000276#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000277 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
278#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000279
280/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000281#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200282
/* Length of the wstr representation: read from the `length` field when
   state.ascii is set, otherwise from `wstr_length`.  No type check is
   performed and `op` is evaluated more than once.  `op` is parenthesized
   before casting so that pointer expressions such as `p + 1` are cast as
   a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (((PyASCIIObject*)(op))->state.ascii ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
287
288/* Returns the deprecated Py_UNICODE representation's size in code units
289 (this includes surrogate pairs as 2 units).
290 If the Py_UNICODE representation is not available, it will be computed
291 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
292
Guido van Rossumd8225182000-03-10 22:33:05 +0000293#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 (assert(PyUnicode_Check(op)), \
295 (((PyASCIIObject *)(op))->wstr) ? \
296 PyUnicode_WSTR_LENGTH(op) : \
297 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
298 PyUnicode_WSTR_LENGTH(op)))
299
Guido van Rossumd8225182000-03-10 22:33:05 +0000300#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200301 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
302
303/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
304 representation on demand. Using this macro is very inefficient now,
305 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
306 use PyUnicode_WRITE() and PyUnicode_READ(). */
307
Guido van Rossumd8225182000-03-10 22:33:05 +0000308#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200309 (assert(PyUnicode_Check(op)), \
310 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
311 PyUnicode_AsUnicode((PyObject *)(op)))
312
Guido van Rossumd8225182000-03-10 22:33:05 +0000313#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 ((const char *)(PyUnicode_AS_UNICODE(op)))
315
316
317/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
318
319/* Values for PyUnicodeObject.state: */
320
321/* Interning state. */
322#define SSTATE_NOT_INTERNED 0
323#define SSTATE_INTERNED_MORTAL 1
324#define SSTATE_INTERNED_IMMORTAL 2
325
/* Nonzero if the state.ascii flag of `op` is set.  No type check is
   performed.  `op` is parenthesized before casting so that pointer
   expressions such as `p + 1` are cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)(op))->state.ascii)
327
328/* String contains only wstr byte characters. This is only possible
329 when the string was created with a legacy API and PyUnicode_Ready()
330 has not been called yet. */
331#define PyUnicode_WCHAR_KIND 0
332
333/* Return values of the PyUnicode_KIND() macro: */
334
335#define PyUnicode_1BYTE_KIND 1
336#define PyUnicode_2BYTE_KIND 2
337#define PyUnicode_4BYTE_KIND 3
338
339
340/* Return the number of bytes the string uses to represent single characters,
341 this can be 1, 2 or 4. */
342#define PyUnicode_CHARACTER_SIZE(op) \
343 (1 << (PyUnicode_KIND(op) - 1))
344
345/* Return pointers to the canonical representation casted as unsigned char,
346 Py_UCS2, or Py_UCS4 for direct character access.
347 No checks are performed, use PyUnicode_CHARACTER_SIZE or
348 PyUnicode_KIND() before to ensure these will work correctly. */
349
350#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
351#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
352#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
353
354/* Return true if the string is compact or 0 if not.
355 No type checks or Ready calls are performed. */
356#define PyUnicode_IS_COMPACT(op) \
357 (((PyASCIIObject*)(op))->state.compact)
358
Victor Stinner157f83f2011-09-28 21:41:31 +0200359/* Return one of the PyUnicode_*_KIND values defined above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200360#define PyUnicode_KIND(op) \
361 (assert(PyUnicode_Check(op)), \
362 assert(PyUnicode_IS_READY(op)), \
363 ((PyASCIIObject *)(op))->state.kind)
364
Victor Stinner157f83f2011-09-28 21:41:31 +0200365/* Return a void pointer to the raw unicode buffer. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366#define _PyUnicode_COMPACT_DATA(op) \
367 (PyUnicode_IS_COMPACT_ASCII(op) ? \
368 ((void*)((PyASCIIObject*)(op) + 1)) : \
369 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
370
371#define _PyUnicode_NONCOMPACT_DATA(op) \
372 (assert(((PyUnicodeObject*)(op))->data.any), \
373 ((((PyUnicodeObject *)(op))->data.any)))
374
375#define PyUnicode_DATA(op) \
376 (assert(PyUnicode_Check(op)), \
377 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
378 _PyUnicode_NONCOMPACT_DATA(op))
379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
381
382 The index is a character index, the result is a size in bytes. */
383#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
384
385/* In the access macros below, "kind" may be evaluated more than once.
386 All other macro parameters are evaluated exactly once, so it is safe
387 to put side effects into them (such as increasing the index). */
388
389/* Write into the canonical representation, this macro does not do any sanity
390 checks and is intended for usage in loops. The caller should cache the
391 kind and data pointers obtained from other macro calls.
392 index is the index in the string (starts at 0) and value is the new
393 code point value which should be written to that location. */
394#define PyUnicode_WRITE(kind, data, index, value) \
395 do { \
396 switch ((kind)) { \
397 case PyUnicode_1BYTE_KIND: { \
398 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
399 break; \
400 } \
401 case PyUnicode_2BYTE_KIND: { \
402 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
403 break; \
404 } \
405 default: { \
406 assert((kind) == PyUnicode_4BYTE_KIND); \
407 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
408 } \
409 } \
410 } while (0)
411
412/* Read a code point from the string's canonical representation. No checks
413 or ready calls are performed. */
/* Read the code point at `index` from the buffer `data`, interpreting the
   buffer according to `kind`: Py_UCS1 for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND, and Py_UCS4 for any other kind.  The result is
   converted to Py_UCS4.  `kind` may be evaluated twice; `data` and `index`
   are evaluated once, since only one branch is taken. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
423
424/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
425 calls PyUnicode_KIND() and might call it twice. For single reads, use
426 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
427 cache kind and use PyUnicode_READ instead. */
/* Read the code point at `index` from the Unicode object `unicode`, using
   PyUnicode_KIND() to select the element type and PyUnicode_DATA() to get
   the buffer.  The result is converted to Py_UCS4.  `unicode` is evaluated
   more than once, so it must not have side effects; `index` is evaluated
   once, since only one branch is taken. */
#define PyUnicode_READ_CHAR(unicode, index) \
    ((Py_UCS4) \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
437
438/* Returns the length of the unicode string. The caller has to make sure that
439 the string has its canonical representation set before calling
440 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
441#define PyUnicode_GET_LENGTH(op) \
442 (assert(PyUnicode_Check(op)), \
443 assert(PyUnicode_IS_READY(op)), \
444 ((PyASCIIObject *)(op))->length)
445
446
447/* Fast check to determine whether an object is ready. Equivalent to
448 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any */
449
/* Nonzero if the state.ready flag of `op` is set.  No type check is
   performed.  `op` is parenthesized before casting so that pointer
   expressions such as `p + 1` are cast as a whole. */
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
451
452/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
453 case. If the canonical representation is not yet set, it will still call
454 PyUnicode_Ready().
455 Returns 0 on success and -1 on errors. */
456#define PyUnicode_READY(op) \
457 (assert(PyUnicode_Check(op)), \
458 (PyUnicode_IS_READY(op) ? \
Victor Stinnerd8f65102011-09-29 19:43:17 +0200459 0 : _PyUnicode_Ready((PyObject *)(op))))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461/* Return a maximum character value which is suitable for creating another
462 string based on op. This is always an approximation but more efficient
463 than iterating over the string. */
464#define PyUnicode_MAX_CHAR_VALUE(op) \
465 (assert(PyUnicode_IS_READY(op)), \
466 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
467 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
468 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
469 (0x7fU) : (0xffU) \
470 ) : \
471 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
472 (0xffffU) : (0x10ffffU) \
473 ))))
474
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000475#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000476
477/* --- Constants ---------------------------------------------------------- */
478
479/* This Unicode character will be used as replacement character during
480 decoding if the errors argument is set to "replace". Note: the
481 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
482 Unicode 3.0. */
483
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200484#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000485
486/* === Public API ========================================================= */
487
488/* --- Plain Py_UNICODE --------------------------------------------------- */
489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490/* With PEP 393, this is the recommended way to allocate a new unicode object.
491 This function will allocate the object and its buffer in a single memory
492 block. Objects created using this function are not resizable. */
493#ifndef Py_LIMITED_API
494PyAPI_FUNC(PyObject*) PyUnicode_New(
495 Py_ssize_t size, /* Number of code points in the new string */
496 Py_UCS4 maxchar /* maximum code point value in the string */
497 );
498#endif
499
Victor Stinnerd8f65102011-09-29 19:43:17 +0200500/* Initializes the canonical string representation from the deprecated
501 wstr/Py_UNICODE representation. This function is used to convert Unicode
502 objects which were created using the old API to the new flexible format
503 introduced with PEP 393.
504
505 Don't call this function directly, use the public PyUnicode_READY() macro
506 instead. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200507#ifndef Py_LIMITED_API
508PyAPI_FUNC(int) _PyUnicode_Ready(
Victor Stinnerd8f65102011-09-29 19:43:17 +0200509 PyObject *unicode /* Unicode object */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200510 );
511#endif
512
513/* Copy character from one unicode object into another, this function performs
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200514 character conversion when necessary and falls back to memcpy if possible.
515
Victor Stinnera0702ab2011-09-29 14:14:38 +0200516 Fail if to is too small (smaller than how_many or smaller than
517 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
518 kind(to), or if to has more than 1 reference.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200519
520 Return the number of written character, or return -1 and raise an exception
521 on error.
522
523 Pseudo-code:
524
525 how_many = min(how_many, len(from) - from_start)
526 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
527 return how_many
Victor Stinnera0702ab2011-09-29 14:14:38 +0200528
529 Note: The function doesn't write a terminating null character.
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200530 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200531#ifndef Py_LIMITED_API
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200532PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 PyObject *to,
534 Py_ssize_t to_start,
535 PyObject *from,
536 Py_ssize_t from_start,
537 Py_ssize_t how_many
538 );
539#endif
540
Guido van Rossumd8225182000-03-10 22:33:05 +0000541/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000542 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000543
544 u may be NULL which causes the contents to be undefined. It is the
545 user's responsibility to fill in the needed data afterwards. Note
546 that modifying the Unicode object contents after construction is
547 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000548
549 The buffer is copied into the new object. */
550
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000551#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000552PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000553 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000554 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000555 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000556#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000557
Georg Brandl952867a2010-06-27 10:17:12 +0000558/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000560 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000561 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000562 );
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000567 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000568 );
569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200570#ifndef Py_LIMITED_API
571PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
572 int kind,
573 const void *buffer,
574 Py_ssize_t size);
575#endif
576
577PyAPI_FUNC(PyObject*) PyUnicode_Substring(
578 PyObject *str,
579 Py_ssize_t start,
580 Py_ssize_t end);
581
582/* Copy the string into a UCS4 buffer including the null character if copy_null
583 is set. Return NULL and raise an exception on error. Raise a ValueError if
584 the buffer is smaller than the string. Return buffer on success.
585
586 buflen is the length of the buffer in (Py_UCS4) characters. */
587PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
588 PyObject *unicode,
589 Py_UCS4* buffer,
590 Py_ssize_t buflen,
591 int copy_null);
592
593/* Copy the string into a UCS4 buffer. A new buffer is allocated using
594 * PyMem_Malloc; if this fails, NULL is returned with a memory error
595 exception set. */
596PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
597
Guido van Rossumd8225182000-03-10 22:33:05 +0000598/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200599 Py_UNICODE buffer.
600 If the wchar_t/Py_UNICODE representation is not yet available, this
601 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000602
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000603#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000604PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000605 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000606 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000607#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609/* Return a read-only pointer to the Unicode object's internal
610 Py_UNICODE buffer and save the length at size.
611 If the wchar_t/Py_UNICODE representation is not yet available, this
612 function will calculate it. */
613
614#ifndef Py_LIMITED_API
615PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
616 PyObject *unicode, /* Unicode object */
617 Py_ssize_t *size /* location where to save the length */
618 );
619#endif
620
Guido van Rossumd8225182000-03-10 22:33:05 +0000621/* Get the length of the Unicode object. */
622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
624 PyObject *unicode
625);
626
Victor Stinner157f83f2011-09-28 21:41:31 +0200627/* Get the number of Py_UNICODE units in the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628 string representation. */
629
Martin v. Löwis18e16552006-02-15 17:27:45 +0000630PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000632 );
633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634/* Read a character from the string. */
635
636PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
637 PyObject *unicode,
638 Py_ssize_t index
639 );
640
641/* Write a character to the string. The string must have been created through
642 PyUnicode_New, must not be shared, and must not have been hashed yet. */
643
644PyAPI_FUNC(int) PyUnicode_WriteChar(
645 PyObject *unicode,
646 Py_ssize_t index,
647 Py_UCS4 character
648 );
649
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000650#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000651/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000652PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000653#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000654
Guido van Rossum52c23592000-04-10 13:41:41 +0000655/* Resize an already allocated Unicode object to the new length.
656
657 *unicode is modified to point to the new (resized) object and 0
658 returned on success.
659
660 This API may only be called by the function which also called the
661 Unicode constructor. The refcount on the object must be 1. Otherwise,
662 an error is returned.
663
664 Error handling is implemented as follows: an exception is set, -1
665 is returned and *unicode left untouched.
666
667*/
668
Mark Hammond91a681d2002-08-12 07:21:58 +0000669PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyObject **unicode, /* Pointer to the Unicode object */
671 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000672 );
673
Guido van Rossumd8225182000-03-10 22:33:05 +0000674/* Coerce obj to a Unicode object and return a reference with
675 *incremented* refcount.
676
677 Coercion is done in the following way:
678
Georg Brandl952867a2010-06-27 10:17:12 +0000679 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000680 under the assumption that they contain data using the UTF-8
681 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000682
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000683 2. All other objects (including Unicode objects) raise an
684 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000685
686 The API returns NULL in case of an error. The caller is responsible
687 for decref'ing the returned objects.
688
689*/
690
Mark Hammond91a681d2002-08-12 07:21:58 +0000691PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000692 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000693 const char *encoding, /* encoding */
694 const char *errors /* error handling */
695 );
696
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000697/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000698 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000700 Unicode objects are passed back as-is (subclasses are converted to
701 true Unicode objects), all other objects are delegated to
702 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000703 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000704
705 The API returns NULL in case of an error. The caller is responsible
706 for decref'ing the returned objects.
707
708*/
709
Mark Hammond91a681d2002-08-12 07:21:58 +0000710PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000712 );
713
Victor Stinner1205f272010-09-11 00:54:47 +0000714PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
715 const char *format, /* ASCII-encoded string */
716 va_list vargs
717 );
718PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
719 const char *format, /* ASCII-encoded string */
720 ...
721 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000723#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000724/* Format the object based on the format_spec, as defined in PEP 3101
725 (Advanced String Formatting). */
726PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727 PyObject *format_spec,
728 Py_ssize_t start,
729 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000730#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000731
Walter Dörwald16807132007-05-25 13:52:07 +0000732PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
733PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000734PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
735 const char *u /* UTF-8 encoded string */
736 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000737#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000738PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000739#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000740
741/* Use only if you know it's a string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742#define PyUnicode_CHECK_INTERNED(op) \
743 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000744
Guido van Rossumd8225182000-03-10 22:33:05 +0000745/* --- wchar_t support for platforms which support it --------------------- */
746
747#ifdef HAVE_WCHAR_H
748
Georg Brandl952867a2010-06-27 10:17:12 +0000749/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000750 size.
751
752 The buffer is copied into the new object. */
753
Mark Hammond91a681d2002-08-12 07:21:58 +0000754PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000755 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000756 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000757 );
758
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000759/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000760 most size wchar_t characters are copied.
761
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000762 Note that the resulting wchar_t string may or may not be
763 0-terminated. It is the responsibility of the caller to make sure
764 that the wchar_t string is 0-terminated in case this is required by
765 the application.
766
767 Returns the number of wchar_t characters copied (excluding a
768 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000769 error. */
770
Martin v. Löwis18e16552006-02-15 17:27:45 +0000771PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000772 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000773 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000774 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000775 );
776
Victor Stinner137c34c2010-09-29 10:25:54 +0000777/* Convert the Unicode object to a wide character string. The output string
778 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200779 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000780
781 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it)
782 on success. On error, returns NULL and raises a MemoryError; *size is
783 undefined. */
784
785PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000786 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000787 Py_ssize_t *size /* number of characters of the result */
788 );
789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200790PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
791
Guido van Rossumd8225182000-03-10 22:33:05 +0000792#endif
793
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000794/* --- Unicode ordinals --------------------------------------------------- */
795
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796/* Create a Unicode Object from the given Unicode code point ordinal.
797
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000798 The ordinal must be in range(0x10000) on narrow Python builds
799 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
800 raised in case it is not.
801
802*/
803
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000804PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000805
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000806/* --- Free-list management ----------------------------------------------- */
807
808/* Clear the free list used by the Unicode implementation.
809
810 This can be used to release memory used for objects on the free
811 list back to the Python memory allocator.
812
813*/
814
815PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
816
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000818
819 Many of these APIs take two arguments encoding and errors. These
820 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000821 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000822
Georg Brandl952867a2010-06-27 10:17:12 +0000823 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000824
825 Error handling is set by errors which may also be set to NULL
826 meaning to use the default handling defined for the codec. Default
827 error handling for all builtin codecs is "strict" (ValueErrors are
828 raised).
829
830 The codecs all use a similar interface. Only deviations from the
831 generic ones are documented.
832
833*/
834
Fred Drakecb093fe2000-05-09 19:51:53 +0000835/* --- Manage the default encoding ---------------------------------------- */
836
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000837/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000838 Unicode object unicode and the size of the encoded representation
839 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000840
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000841 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000842
Victor Stinner157f83f2011-09-28 21:41:31 +0200843 This function caches the UTF-8 encoded string in the unicodeobject
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 and subsequent calls will return the same string. The memory is released
845 when the unicodeobject is deallocated.
846
847 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
848 support the previous internal function with the same behaviour.
849
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000850 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000851 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000852
853 *** If you need to access the Unicode object as UTF-8 bytes string,
854 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000855*/
856
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000857#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000859 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000860 Py_ssize_t *size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000862#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000863
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000864/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000865 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200867 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
868 in the unicodeobject.
869
870 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
871 support the previous internal function with the same behaviour.
872
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000873 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000874 extracted from the returned data.
875
876 *** This API is for interpreter INTERNAL USE ONLY and will likely
877 *** be removed or changed in the future.
878
879 *** If you need to access the Unicode object as UTF-8 bytes string,
880 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000881
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000882*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000883
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000884#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
886#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000887#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000888
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000889/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000890
Mark Hammond91a681d2002-08-12 07:21:58 +0000891PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000892
Guido van Rossumd8225182000-03-10 22:33:05 +0000893/* --- Generic Codecs ----------------------------------------------------- */
894
895/* Create a Unicode object by decoding the encoded string s of the
896 given size. */
897
Mark Hammond91a681d2002-08-12 07:21:58 +0000898PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000899 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000900 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000901 const char *encoding, /* encoding */
902 const char *errors /* error handling */
903 );
904
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000905/* Decode a Unicode object unicode and return the result as Python
906 object. */
907
908PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000909 PyObject *unicode, /* Unicode object */
910 const char *encoding, /* encoding */
911 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000912 );
913
914/* Decode a Unicode object unicode and return the result as Unicode
915 object. */
916
917PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000918 PyObject *unicode, /* Unicode object */
919 const char *encoding, /* encoding */
920 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000921 );
922
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000924 Python string object. */
925
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000926#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000927PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000928 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000929 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000930 const char *encoding, /* encoding */
931 const char *errors /* error handling */
932 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000933#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000934
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000935/* Encodes a Unicode object and returns the result as Python
936 object. */
937
938PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 PyObject *unicode, /* Unicode object */
940 const char *encoding, /* encoding */
941 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000942 );
943
Guido van Rossumd8225182000-03-10 22:33:05 +0000944/* Encodes a Unicode object and returns the result as Python string
945 object. */
946
Mark Hammond91a681d2002-08-12 07:21:58 +0000947PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000948 PyObject *unicode, /* Unicode object */
949 const char *encoding, /* encoding */
950 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000951 );
952
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000953/* Encodes a Unicode object and returns the result as Unicode
954 object. */
955
956PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000957 PyObject *unicode, /* Unicode object */
958 const char *encoding, /* encoding */
959 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000960 );
961
962/* Build an encoding map. */
963
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000964PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
965 PyObject* string /* 256 character map */
966 );
967
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000968/* --- UTF-7 Codecs ------------------------------------------------------- */
969
Mark Hammond91a681d2002-08-12 07:21:58 +0000970PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000971 const char *string, /* UTF-7 encoded string */
972 Py_ssize_t length, /* size of string */
973 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 );
975
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000976PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000977 const char *string, /* UTF-7 encoded string */
978 Py_ssize_t length, /* size of string */
979 const char *errors, /* error handling */
980 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000981 );
982
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000983#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000984PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000985 const Py_UNICODE *data, /* Unicode char buffer */
986 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
987 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
988 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
989 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000990 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000991#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992
Guido van Rossumd8225182000-03-10 22:33:05 +0000993/* --- UTF-8 Codecs ------------------------------------------------------- */
994
Mark Hammond91a681d2002-08-12 07:21:58 +0000995PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000996 const char *string, /* UTF-8 encoded string */
997 Py_ssize_t length, /* size of string */
998 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 );
1000
Walter Dörwald69652032004-09-07 20:24:22 +00001001PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001002 const char *string, /* UTF-8 encoded string */
1003 Py_ssize_t length, /* size of string */
1004 const char *errors, /* error handling */
1005 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001006 );
1007
Mark Hammond91a681d2002-08-12 07:21:58 +00001008PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001010 );
1011
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001012#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1014 PyObject *unicode,
1015 const char *errors);
1016
Mark Hammond91a681d2002-08-12 07:21:58 +00001017PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001018 const Py_UNICODE *data, /* Unicode char buffer */
1019 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1020 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001021 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001022#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001023
Walter Dörwald41980ca2007-08-16 21:55:45 +00001024/* --- UTF-32 Codecs ------------------------------------------------------ */
1025
1026/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1027 the corresponding Unicode object.
1028
1029 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001031
1032 If byteorder is non-NULL, the decoder starts decoding using the
1033 given byte order:
1034
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001035 *byteorder == -1: little endian
1036 *byteorder == 0: native order
1037 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001038
1039 In native mode, the first four bytes of the stream are checked for a
1040 BOM mark. If found, the BOM mark is analysed, the byte order
1041 adjusted and the BOM skipped. In the other modes, no BOM mark
1042 interpretation is done. After completion, *byteorder is set to the
1043 current byte order at the end of input data.
1044
1045 If byteorder is NULL, the codec starts in native order mode.
1046
1047*/
1048
1049PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050 const char *string, /* UTF-32 encoded string */
1051 Py_ssize_t length, /* size of string */
1052 const char *errors, /* error handling */
1053 int *byteorder /* pointer to byteorder to use
1054 0=native;-1=LE,1=BE; updated on
1055 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001056 );
1057
1058PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 const char *string, /* UTF-32 encoded string */
1060 Py_ssize_t length, /* size of string */
1061 const char *errors, /* error handling */
1062 int *byteorder, /* pointer to byteorder to use
1063 0=native;-1=LE,1=BE; updated on
1064 exit */
1065 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001066 );
1067
1068/* Returns a Python string using the UTF-32 encoding in native byte
1069 order. The string always starts with a BOM mark. */
1070
1071PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001072 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001073 );
1074
1075/* Returns a Python string object holding the UTF-32 encoded value of
1076 the Unicode data.
1077
1078 If byteorder is not 0, output is written according to the following
1079 byte order:
1080
1081 byteorder == -1: little endian
1082 byteorder == 0: native byte order (writes a BOM mark)
1083 byteorder == 1: big endian
1084
1085 If byteorder is 0, the output string will always start with the
1086 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1087 prepended.
1088
1089*/
1090
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001091#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001092PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001093 const Py_UNICODE *data, /* Unicode char buffer */
1094 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1095 const char *errors, /* error handling */
1096 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001097 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001098#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001099
Guido van Rossumd8225182000-03-10 22:33:05 +00001100/* --- UTF-16 Codecs ------------------------------------------------------ */
1101
Guido van Rossum9e896b32000-04-05 20:11:21 +00001102/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001103 the corresponding Unicode object.
1104
1105 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001107
1108 If byteorder is non-NULL, the decoder starts decoding using the
1109 given byte order:
1110
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001111 *byteorder == -1: little endian
1112 *byteorder == 0: native order
1113 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001114
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001115 In native mode, the first two bytes of the stream are checked for a
1116 BOM mark. If found, the BOM mark is analysed, the byte order
1117 adjusted and the BOM skipped. In the other modes, no BOM mark
1118 interpretation is done. After completion, *byteorder is set to the
1119 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001120
1121 If byteorder is NULL, the codec starts in native order mode.
1122
1123*/
1124
Mark Hammond91a681d2002-08-12 07:21:58 +00001125PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 const char *string, /* UTF-16 encoded string */
1127 Py_ssize_t length, /* size of string */
1128 const char *errors, /* error handling */
1129 int *byteorder /* pointer to byteorder to use
1130 0=native;-1=LE,1=BE; updated on
1131 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001132 );
1133
Walter Dörwald69652032004-09-07 20:24:22 +00001134PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001135 const char *string, /* UTF-16 encoded string */
1136 Py_ssize_t length, /* size of string */
1137 const char *errors, /* error handling */
1138 int *byteorder, /* pointer to byteorder to use
1139 0=native;-1=LE,1=BE; updated on
1140 exit */
1141 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001142 );
1143
Guido van Rossumd8225182000-03-10 22:33:05 +00001144/* Returns a Python string using the UTF-16 encoding in native byte
1145 order. The string always starts with a BOM mark. */
1146
Mark Hammond91a681d2002-08-12 07:21:58 +00001147PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001149 );
1150
1151/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001152 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001153
1154 If byteorder is not 0, output is written according to the following
1155 byte order:
1156
1157 byteorder == -1: little endian
1158 byteorder == 0: native byte order (writes a BOM mark)
1159 byteorder == 1: big endian
1160
1161 If byteorder is 0, the output string will always start with the
1162 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1163 prepended.
1164
1165 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1166 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001167 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001168
1169*/
1170
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001171#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001172PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 const Py_UNICODE *data, /* Unicode char buffer */
1174 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1175 const char *errors, /* error handling */
1176 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001177 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001178#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001179
1180/* --- Unicode-Escape Codecs ---------------------------------------------- */
1181
Mark Hammond91a681d2002-08-12 07:21:58 +00001182PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001183 const char *string, /* Unicode-Escape encoded string */
1184 Py_ssize_t length, /* size of string */
1185 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001186 );
1187
Mark Hammond91a681d2002-08-12 07:21:58 +00001188PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001190 );
1191
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001192#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001193PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001194 const Py_UNICODE *data, /* Unicode char buffer */
1195 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001196 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001197#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001198
1199/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1200
Mark Hammond91a681d2002-08-12 07:21:58 +00001201PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 const char *string, /* Raw-Unicode-Escape encoded string */
1203 Py_ssize_t length, /* size of string */
1204 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001205 );
1206
Mark Hammond91a681d2002-08-12 07:21:58 +00001207PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001209 );
1210
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001211#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001212PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001213 const Py_UNICODE *data, /* Unicode char buffer */
1214 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001215 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001216#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001217
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001218/* --- Unicode Internal Codec ---------------------------------------------
1219
1220 Only for internal use in _codecsmodule.c */
1221
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001222#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001223PyObject *_PyUnicode_DecodeUnicodeInternal(
1224 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001225 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001226 const char *errors
1227 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001228#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001229
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001231
1232 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1233
1234*/
1235
Mark Hammond91a681d2002-08-12 07:21:58 +00001236PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001237 const char *string, /* Latin-1 encoded string */
1238 Py_ssize_t length, /* size of string */
1239 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001240 );
1241
Mark Hammond91a681d2002-08-12 07:21:58 +00001242PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001244 );
1245
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001246#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1248 PyObject* unicode,
1249 const char* errors);
1250
Mark Hammond91a681d2002-08-12 07:21:58 +00001251PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001252 const Py_UNICODE *data, /* Unicode char buffer */
1253 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1254 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001255 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001256#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001257
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001259
1260 Only 7-bit ASCII data is accepted. All other codes generate errors.
1261
1262*/
1263
Mark Hammond91a681d2002-08-12 07:21:58 +00001264PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 const char *string, /* ASCII encoded string */
1266 Py_ssize_t length, /* size of string */
1267 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001268 );
1269
Mark Hammond91a681d2002-08-12 07:21:58 +00001270PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001272 );
1273
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001274#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1276 PyObject* unicode,
1277 const char* errors);
1278
Mark Hammond91a681d2002-08-12 07:21:58 +00001279PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 const Py_UNICODE *data, /* Unicode char buffer */
1281 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1282 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001283 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001284#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001285
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001286/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001287
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001289
1290 Decoding mappings must map single string characters to single
1291 Unicode characters, integers (which are then interpreted as Unicode
1292 ordinals) or None (meaning "undefined mapping" and causing an
1293 error).
1294
1295 Encoding mappings must map single Unicode characters to single
1296 string characters, integers (which are then interpreted as Latin-1
1297 ordinals) or None (meaning "undefined mapping" and causing an
1298 error).
1299
1300 If a character lookup fails with a LookupError, the character is
1301 copied as-is meaning that its ordinal value will be interpreted as
1302 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1303 to contain those mappings which map characters to different code
1304 points.
1305
1306*/
1307
Mark Hammond91a681d2002-08-12 07:21:58 +00001308PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001309 const char *string, /* Encoded string */
1310 Py_ssize_t length, /* size of string */
1311 PyObject *mapping, /* character mapping
1312 (char ordinal -> unicode ordinal) */
1313 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001314 );
1315
Mark Hammond91a681d2002-08-12 07:21:58 +00001316PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 PyObject *unicode, /* Unicode object */
1318 PyObject *mapping /* character mapping
1319 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001320 );
1321
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001323PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001324 const Py_UNICODE *data, /* Unicode char buffer */
1325 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1326 PyObject *mapping, /* character mapping
1327 (unicode ordinal -> char ordinal) */
1328 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001329 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001330#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001331
1332/* Translate a Py_UNICODE buffer of the given length by applying a
1333 character mapping table to it and return the resulting Unicode
1334 object.
1335
1336 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001338
1339 Mapping tables may be dictionaries or sequences. Unmapped character
1340 ordinals (ones which cause a LookupError) are left untouched and
1341 are copied as-is.
1342
1343*/
1344
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001345#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001346PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001347 const Py_UNICODE *data, /* Unicode char buffer */
1348 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1349 PyObject *table, /* Translate table */
1350 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001351 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001352#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001353
Victor Stinner99b95382011-07-04 14:23:54 +02001354#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001355
Guido van Rossumefec1152000-03-28 02:01:15 +00001356/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001357
Mark Hammond91a681d2002-08-12 07:21:58 +00001358PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001359 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001361 const char *errors /* error handling */
1362 );
1363
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001364PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1365 const char *string, /* MBCS encoded string */
1366 Py_ssize_t length, /* size of string */
1367 const char *errors, /* error handling */
1368 Py_ssize_t *consumed /* bytes consumed */
1369 );
1370
Mark Hammond91a681d2002-08-12 07:21:58 +00001371PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001372 PyObject *unicode /* Unicode object */
1373 );
1374
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001375#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001376PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001377 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001378 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001379 const char *errors /* error handling */
1380 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001381#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001382
Victor Stinner99b95382011-07-04 14:23:54 +02001383#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001384
Guido van Rossum9e896b32000-04-05 20:11:21 +00001385/* --- Decimal Encoder ---------------------------------------------------- */
1386
1387/* Takes a Unicode string holding a decimal value and writes it into
1388 an output buffer using standard ASCII digit codes.
1389
1390 The output buffer has to provide at least length+1 bytes of storage
1391 area. The output string is 0-terminated.
1392
1393 The encoder converts whitespace to ' ', decimal characters to their
1394 corresponding ASCII digit and all other Latin-1 characters except
1395 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1396 are treated as errors. This includes embedded NUL characters.
1397
1398 Error handling is defined by the errors argument:
1399
1400 NULL or "strict": raise a ValueError
1401 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001403 "replace": replaces illegal characters with '?'
1404
1405 Returns 0 on success, -1 on failure.
1406
1407*/
1408
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001409#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001410PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 Py_UNICODE *s, /* Unicode buffer */
1412 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1413 char *output, /* Output buffer; must have size >= length + 1
                     (room for the terminating 0 byte) */
1414 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001415 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001416#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001417
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001418/* Transforms code points that have decimal digit property to the
1419 corresponding ASCII digit code points.
1420
1421 Returns a new Unicode string on success, NULL on failure.
1422*/
1423
Georg Brandlb5503082010-12-05 11:40:48 +00001424#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001425PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1426 Py_UNICODE *s, /* Unicode buffer */
1427 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1428 );
Georg Brandlb5503082010-12-05 11:40:48 +00001429#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1432 as argument instead of a raw buffer and length. This function additionally
1433 transforms spaces to ASCII because this is what the callers in longobject,
1434 floatobject, and complexobject did anyways. */
1435
1436#ifndef Py_LIMITED_API
1437PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1438 PyObject *unicode /* Unicode object */
1439 );
1440#endif
1441
Martin v. Löwis011e8422009-05-05 04:43:17 +00001442/* --- File system encoding ---------------------------------------------- */
1443
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001444/* ParseTuple converter: encode str objects to bytes using
1445 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001446
1447PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1448
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001449/* ParseTuple converter: decode bytes objects to unicode using
1450 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1451
1452PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1453
Victor Stinner77c38622010-05-14 15:58:55 +00001454/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1455 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001456
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001457 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1458 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001459
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001460 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001461*/
1462
1463PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1464 const char *s /* encoded string */
1465 );
1466
Victor Stinner77c38622010-05-14 15:58:55 +00001467/* Decode a string using Py_FileSystemDefaultEncoding
1468 and the "surrogateescape" error handler.
1469
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001470 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1471 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001472*/
1473
Martin v. Löwis011e8422009-05-05 04:43:17 +00001474PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1475 const char *s, /* encoded string */
1476 Py_ssize_t size /* size */
1477 );
1478
Victor Stinnerae6265f2010-05-15 16:27:27 +00001479/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001480 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001481
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001482 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1483 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001484*/
1485
1486PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1487 PyObject *unicode
1488 );
1489
Guido van Rossumd8225182000-03-10 22:33:05 +00001490/* --- Methods & Slots ----------------------------------------------------
1491
1492 These are capable of handling Unicode objects and strings on input
1493 (we refer to them as strings in the descriptions) and return
1494 Unicode objects or integers as appropriate. */
1495
1496/* Concat two strings giving a new Unicode string. */
1497
Mark Hammond91a681d2002-08-12 07:21:58 +00001498PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 PyObject *left, /* Left string */
1500 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001501 );
1502
Walter Dörwald1ab83302007-05-18 17:15:44 +00001503/* Concat two strings and put the result in *pleft
1504 (sets *pleft to NULL on error) */
1505
1506PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 PyObject **pleft, /* Pointer to left string */
1508 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001509 );
1510
1511/* Concat two strings, put the result in *pleft and drop the right object
1512 (sets *pleft to NULL on error) */
1513
1514PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001515 PyObject **pleft, /* Pointer to left string */
1516 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001517 );
1518
Guido van Rossumd8225182000-03-10 22:33:05 +00001519/* Split a string giving a list of Unicode strings.
1520
1521 If sep is NULL, splitting will be done at all whitespace
1522 substrings. Otherwise, splits occur at the given separator.
1523
1524 At most maxsplit splits will be done. If negative, no limit is set.
1525
1526 Separators are not included in the resulting list.
1527
1528*/
1529
Mark Hammond91a681d2002-08-12 07:21:58 +00001530PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 PyObject *s, /* String to split */
1532 PyObject *sep, /* String separator */
1533 Py_ssize_t maxsplit /* Maxsplit count */
1534 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001535
1536/* Ditto, but split at line breaks.
1537
1538 CRLF is considered to be one line break. Line breaks are not
1539 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001540
Mark Hammond91a681d2002-08-12 07:21:58 +00001541PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001542 PyObject *s, /* String to split */
1543 int keepends /* If true, line end markers are included */
1544 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001545
Thomas Wouters477c8d52006-05-27 19:21:47 +00001546/* Partition a string using a given separator. */
1547
1548PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001549 PyObject *s, /* String to partition */
1550 PyObject *sep /* String separator */
1551 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001552
1553/* Partition a string using a given separator, searching from the end of the
1554 string. */
1555
1556PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001557 PyObject *s, /* String to partition */
1558 PyObject *sep /* String separator */
1559 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001560
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001561/* Split a string giving a list of Unicode strings.
1562
1563 If sep is NULL, splitting will be done at all whitespace
1564 substrings. Otherwise, splits occur at the given separator.
1565
1566 At most maxsplit splits will be done; if negative, no limit is
1567 set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
1568 of the string.
1569
1570 Separators are not included in the resulting list.
1571
1572*/
1573
1574PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001575 PyObject *s, /* String to split */
1576 PyObject *sep, /* String separator */
1577 Py_ssize_t maxsplit /* Maxsplit count */
1578 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001579
Guido van Rossumd8225182000-03-10 22:33:05 +00001580/* Translate a string by applying a character mapping table to it and
1581 return the resulting Unicode object.
1582
1583 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001584 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001585
1586 Mapping tables may be dictionaries or sequences. Unmapped character
1587 ordinals (ones which cause a LookupError) are left untouched and
1588 are copied as-is.
1589
1590*/
1591
Mark Hammond91a681d2002-08-12 07:21:58 +00001592PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001593 PyObject *str, /* String */
1594 PyObject *table, /* Translate table */
1595 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001596 );
1597
1598/* Join a sequence of strings using the given separator and return
1599 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001600
Mark Hammond91a681d2002-08-12 07:21:58 +00001601PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001602 PyObject *separator, /* Separator string */
1603 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001604 );
1605
1606/* Return 1 if substr matches str[start:end] at the given tail end, 0
1607 otherwise. */
1608
Martin v. Löwis18e16552006-02-15 17:27:45 +00001609PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001610 PyObject *str, /* String */
1611 PyObject *substr, /* Prefix or Suffix string */
1612 Py_ssize_t start, /* Start index */
1613 Py_ssize_t end, /* Stop index */
1614 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001615 );
1616
1617/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001618 given search direction or -1 if not found. -2 is returned in case
1619 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001620
Martin v. Löwis18e16552006-02-15 17:27:45 +00001621PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 PyObject *str, /* String */
1623 PyObject *substr, /* Substring to find */
1624 Py_ssize_t start, /* Start index */
1625 Py_ssize_t end, /* Stop index */
1626 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001627 );
1628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629/* Like PyUnicode_Find, but search for single character only. */
1630PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1631 PyObject *str,
1632 Py_UCS4 ch,
1633 Py_ssize_t start,
1634 Py_ssize_t end,
1635 int direction
1636 );
1637
Barry Warsaw51ac5802000-03-20 16:36:48 +00001638/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001639
Martin v. Löwis18e16552006-02-15 17:27:45 +00001640PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001641 PyObject *str, /* String */
1642 PyObject *substr, /* Substring to count */
1643 Py_ssize_t start, /* Start index */
1644 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001645 );
1646
Barry Warsaw51ac5802000-03-20 16:36:48 +00001647/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001648 and return the resulting Unicode object. */
1649
Mark Hammond91a681d2002-08-12 07:21:58 +00001650PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001651 PyObject *str, /* String */
1652 PyObject *substr, /* Substring to find */
1653 PyObject *replstr, /* Substring to replace */
1654 Py_ssize_t maxcount /* Max. number of replacements to apply;
1655 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001656 );
1657
1658/* Compare two strings and return -1, 0, 1 for less than, equal,
1659 greater than resp. */
1660
Mark Hammond91a681d2002-08-12 07:21:58 +00001661PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001662 PyObject *left, /* Left string */
1663 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001664 );
1665
Martin v. Löwis5b222132007-06-10 09:51:05 +00001666PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1667 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001668 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001669 );
1670
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001671/* Rich compare two strings and return one of the following:
1672
1673 - NULL in case an exception was raised
1674 - Py_True or Py_False for successful comparisons
1675 - Py_NotImplemented in case the type combination is unknown
1676
1677 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1678 case the conversion of the arguments to Unicode fails with a
1679 UnicodeDecodeError.
1680
1681 Possible values for op:
1682
1683 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1684
1685*/
1686
1687PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001688 PyObject *left, /* Left string */
1689 PyObject *right, /* Right string */
1690 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001691 );
1692
Thomas Wouters7e474022000-07-16 12:04:32 +00001693/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001694 the resulting Unicode string. */
1695
Mark Hammond91a681d2002-08-12 07:21:58 +00001696PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001697 PyObject *format, /* Format string */
1698 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001699 );
1700
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001701/* Checks whether element is contained in container and returns 1/0
1702 accordingly.
1703
1704 element has to coerce to a one-element Unicode string. -1 is
1705 returned in case of an error. */
1706
Mark Hammond91a681d2002-08-12 07:21:58 +00001707PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 PyObject *container, /* Container string */
1709 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001710 );
1711
Martin v. Löwis47383402007-08-15 07:32:56 +00001712/* Checks whether argument is a valid identifier. */
1713
1714PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1715
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001716#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001717/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001718PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001719 PyUnicodeObject *self,
1720 int striptype,
1721 PyObject *sepobj
1722 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001723#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001724
Eric Smith5807c412008-05-11 21:00:57 +00001725/* Using the current locale, insert the thousands grouping
1726 into the string pointed to by buffer. For the argument descriptions,
1727 see Objects/stringlib/localeutil.h */
1728
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001729#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001730PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1731 Py_ssize_t n_buffer,
1732 Py_UNICODE *digits,
1733 Py_ssize_t n_digits,
1734 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001735#endif
Eric Smith5807c412008-05-11 21:00:57 +00001736
Eric Smitha3b1ac82009-04-03 14:45:06 +00001737/* Using explicit passed-in values, insert the thousands grouping
1738 into the string pointed to by buffer. For the argument descriptions,
1739 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001740#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1742 int kind,
1743 void *buffer,
1744 Py_ssize_t n_buffer,
1745 void *digits,
1746 Py_ssize_t n_digits,
1747 Py_ssize_t min_width,
1748 const char *grouping,
1749 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001750#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001751/* === Characters Type APIs =============================================== */
1752
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001753/* Helper array used by Py_UNICODE_ISSPACE(). */
1754
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001755#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001756PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1757
Guido van Rossumd8225182000-03-10 22:33:05 +00001758/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001759 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001760
1761 These APIs are implemented in Objects/unicodectype.c.
1762
1763*/
1764
Mark Hammond91a681d2002-08-12 07:21:58 +00001765PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001766 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001767 );
1768
Mark Hammond91a681d2002-08-12 07:21:58 +00001769PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001770 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001771 );
1772
Mark Hammond91a681d2002-08-12 07:21:58 +00001773PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001774 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001775 );
1776
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001777PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001778 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001779 );
1780
1781PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001782 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001783 );
1784
Mark Hammond91a681d2002-08-12 07:21:58 +00001785PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001786 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001787 );
1788
Mark Hammond91a681d2002-08-12 07:21:58 +00001789PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001790 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001791 );
1792
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001793PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1794 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001795 );
1796
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001797PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1798 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001799 );
1800
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001801PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1802 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001803 );
1804
Mark Hammond91a681d2002-08-12 07:21:58 +00001805PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001806 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001807 );
1808
Mark Hammond91a681d2002-08-12 07:21:58 +00001809PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001810 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001811 );
1812
Mark Hammond91a681d2002-08-12 07:21:58 +00001813PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001814 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001815 );
1816
Mark Hammond91a681d2002-08-12 07:21:58 +00001817PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001818 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001819 );
1820
Mark Hammond91a681d2002-08-12 07:21:58 +00001821PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001822 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001823 );
1824
Mark Hammond91a681d2002-08-12 07:21:58 +00001825PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001826 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001827 );
1828
Georg Brandl559e5d72008-06-11 18:37:52 +00001829PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001830 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001831 );
1832
Mark Hammond91a681d2002-08-12 07:21:58 +00001833PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001834 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001835 );
1836
Victor Stinneref8d95c2010-08-16 22:03:11 +00001837PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1838 const Py_UNICODE *u
1839 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001840
1841PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001842 Py_UNICODE *s1,
1843 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001844
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001845PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1846 Py_UNICODE *s1, const Py_UNICODE *s2);
1847
Martin v. Löwis5b222132007-06-10 09:51:05 +00001848PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001849 Py_UNICODE *s1,
1850 const Py_UNICODE *s2,
1851 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001852
/* Prototype only: takes two read-only Py_UNICODE buffers `s1` and `s2`
   and returns int.
   NOTE(review): presumably mirrors C strcmp (negative / zero / positive
   ordering result) -- confirm against the definition. */
1853PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001854 const Py_UNICODE *s1,
1855 const Py_UNICODE *s2
1856 );
1857
/* Prototype only: takes two read-only Py_UNICODE buffers `s1` and `s2`
   plus a size_t `n`, and returns int.
   NOTE(review): presumably mirrors C strncmp, comparing at most `n`
   Py_UNICODE units -- confirm against the definition. */
1858PyAPI_FUNC(int) Py_UNICODE_strncmp(
1859 const Py_UNICODE *s1,
1860 const Py_UNICODE *s2,
1861 size_t n
1862 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001863
/* Prototype only: takes a read-only Py_UNICODE buffer `s` and a
   Py_UNICODE value `c`; returns a non-const Py_UNICODE pointer (the
   const qualifier of `s` is not carried into the return type).
   NOTE(review): presumably mirrors C strchr (first occurrence of `c` in
   `s`, NULL if absent) -- confirm against the definition. */
1864PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001865 const Py_UNICODE *s,
1866 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001867 );
1868
/* Prototype only: takes a read-only Py_UNICODE buffer `s` and a
   Py_UNICODE value `c`; returns a non-const Py_UNICODE pointer (the
   const qualifier of `s` is not carried into the return type).
   NOTE(review): presumably mirrors C strrchr (last occurrence of `c` in
   `s`, NULL if absent) -- confirm against the definition. */
Victor Stinner331ea922010-08-10 16:37:20 +00001869PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001870 const Py_UNICODE *s,
1871 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001872 );
1873
/* Prototype only: takes a pointer to const Py_UCS4 `u` and returns a
   size_t. Same parameter shape as Py_UNICODE_strlen, with Py_UCS4 as the
   element type.
   NOTE(review): presumably counts Py_UCS4 units before a terminating
   zero element -- confirm against the definition. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874PyAPI_FUNC(size_t) Py_UCS4_strlen(
1875 const Py_UCS4 *u
1876 );
1877
/* Prototype only: takes a writable Py_UCS4 buffer `s1` and a read-only
   Py_UCS4 buffer `s2`; returns a Py_UCS4 pointer. No length parameter is
   taken.
   NOTE(review): presumably mirrors C strcpy (copy `s2` into `s1`, return
   `s1`), in which case the caller must size `s1` -- confirm against the
   definition. */
1878PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1879 Py_UCS4 *s1,
1880 const Py_UCS4 *s2);
1881
/* Prototype only: takes a writable Py_UCS4 buffer `s1` and a read-only
   Py_UCS4 buffer `s2`; returns a Py_UCS4 pointer. No length parameter is
   taken.
   NOTE(review): presumably mirrors C strcat (append `s2` to `s1`, return
   `s1`) -- confirm against the definition. */
1882PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1883 Py_UCS4 *s1, const Py_UCS4 *s2);
1884
/* Prototype only: takes a writable Py_UCS4 buffer `s1`, a read-only
   Py_UCS4 buffer `s2` and a size_t `n`; returns a Py_UCS4 pointer.
   NOTE(review): presumably mirrors C strncpy with `n` counted in Py_UCS4
   units; whether the result is always terminated is not visible here --
   confirm against the definition. */
1885PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1886 Py_UCS4 *s1,
1887 const Py_UCS4 *s2,
1888 size_t n);
1889
/* Prototype only: takes two read-only Py_UCS4 buffers `s1` and `s2` and
   returns int.
   NOTE(review): presumably mirrors C strcmp (negative / zero / positive
   ordering result) -- confirm against the definition. */
1890PyAPI_FUNC(int) Py_UCS4_strcmp(
1891 const Py_UCS4 *s1,
1892 const Py_UCS4 *s2
1893 );
1894
/* Prototype only: takes two read-only Py_UCS4 buffers `s1` and `s2` plus
   a size_t `n`, and returns int.
   NOTE(review): presumably mirrors C strncmp, comparing at most `n`
   Py_UCS4 units -- confirm against the definition. */
1895PyAPI_FUNC(int) Py_UCS4_strncmp(
1896 const Py_UCS4 *s1,
1897 const Py_UCS4 *s2,
1898 size_t n
1899 );
1900
/* Prototype only: takes a read-only Py_UCS4 buffer `s` and a Py_UCS4
   value `c`; returns a non-const Py_UCS4 pointer (the const qualifier of
   `s` is not carried into the return type).
   NOTE(review): presumably mirrors C strchr (first occurrence of `c` in
   `s`, NULL if absent) -- confirm against the definition. */
1901PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1902 const Py_UCS4 *s,
1903 Py_UCS4 c
1904 );
1905
/* Prototype only: takes a read-only Py_UCS4 buffer `s` and a Py_UCS4
   value `c`; returns a non-const Py_UCS4 pointer (the const qualifier of
   `s` is not carried into the return type).
   NOTE(review): presumably mirrors C strrchr (last occurrence of `c` in
   `s`, NULL if absent) -- confirm against the definition. */
1906PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1907 const Py_UCS4 *s,
1908 Py_UCS4 c
1909 );
1910
Victor Stinner71133ff2010-09-01 23:43:53 +00001911/* Create a copy of a unicode string ending with a nul character. Return NULL
   and raise a MemoryError exception on memory allocation failure; otherwise
   return a newly allocated buffer (use PyMem_Free() to free the buffer).

   unicode: the object to copy, passed as a PyObject pointer.
   NOTE(review): behaviour for an argument that is not a unicode object is
   not stated here -- confirm against the definition. */
1914
Victor Stinner46408602010-09-03 16:18:00 +00001915PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001916 PyObject *unicode
1917 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001918#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001919
Guido van Rossumd8225182000-03-10 22:33:05 +00001920#ifdef __cplusplus
1921}
1922#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001923#endif /* !Py_UNICODEOBJECT_H */