blob: dbd4fd8ad01cfee7f686d29b734f58b28e1f9661 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020067#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000069#endif
70
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020071#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000076
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000081/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000082 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020086/* Py_UNICODE was the native Unicode storage format (code unit) used by
87 Python and represents a single Unicode element in the Unicode type.
88 With PEP 393, Py_UNICODE is deprecated and replaced with a
89 typedef to wchar_t. */
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020091#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
Guido van Rossumd8225182000-03-10 22:33:05 +000094#endif
95
96/* If the compiler provides a wchar_t type we try to support it
Victor Stinner137c34c2010-09-29 10:25:54 +000097 through the interface functions PyUnicode_FromWideChar(),
98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
Guido van Rossumd8225182000-03-10 22:33:05 +000099
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106#if defined(MS_WINDOWS)
Victor Stinner99b95382011-07-04 14:23:54 +0200107# define HAVE_MBCS
108#endif
109
Guido van Rossumd8225182000-03-10 22:33:05 +0000110#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113# include <time.h>
114# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000115# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000116#endif
117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119 unicode representations. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000122#elif SIZEOF_LONG >= 4
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000123typedef unsigned long Py_UCS4;
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000124#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200125#error "Could not find a proper typedef for Py_UCS4"
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126#endif
127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000133/* Since splitting on whitespace is an important use case, and
134 whitespace in most situations is solely ASCII whitespace, we
135 optimize for the common case by using a quick look-up table
136 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000137
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000138 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000139#ifndef Py_LIMITED_API
Christian Heimes190d79e2008-01-30 11:58:22 +0000140#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000162
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000163#define Py_UNICODE_ISALNUM(ch) \
164 (Py_UNICODE_ISALPHA(ch) || \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 Py_UNICODE_ISDECIMAL(ch) || \
166 Py_UNICODE_ISDIGIT(ch) || \
167 Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200169#define Py_UNICODE_COPY(target, source, length) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000171
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   target: pointer to the Py_UNICODE buffer to write into.
   value:  the Py_UNICODE value stored in every slot.
   length: number of slots to fill; nothing is written if it is <= 0.

   Each argument is evaluated exactly once, in the order target, value,
   length.  `length` is captured in a local before the loop so that an
   argument with side effects is not re-evaluated on every iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000176
/* Macros to work with surrogates.

   The three predicates test whether `ch` lies in the surrogate range
   (0xD800..0xDFFF), the high-surrogate range (0xD800..0xDBFF) or the
   low-surrogate range (0xDC00..0xDFFF).  Every use of `ch` is parenthesized
   so that an expression argument such as `c | mask` or `cond ? a : b` is
   compared as a whole.  `ch` is evaluated up to twice, so it must not have
   side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; the caller is expected to
   have checked that `high` and `low` really are a high/low surrogate pair. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000186/* Check if substring matches at given offset. The offset must be
187 valid, and the substring must not be empty. */
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000188
Thomas Wouters477c8d52006-05-27 19:21:47 +0000189#define Py_UNICODE_MATCH(string, offset, substring) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
193
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000194#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000202#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205 structure. state.ascii and state.compact are set, and the data
206 immediately follow the structure. utf8_length and wstr_length can be found
207 in the length field; the utf8 pointer is equal to the data pointer. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000208typedef struct {
209 PyObject_HEAD
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200210 Py_ssize_t length; /* Number of code points in the string */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000211 Py_hash_t hash; /* Hash value; -1 if not set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200212 struct {
213 /*
214 SSTATE_NOT_INTERNED (0)
215 SSTATE_INTERNED_MORTAL (1)
216 SSTATE_INTERNED_IMMORTAL (2)
217
218 If interned != SSTATE_NOT_INTERNED, the two references from the
219 dictionary to this object are *not* counted in ob_refcnt.
220 */
221 unsigned int interned:2;
222 /* Character size:
223
224 PyUnicode_WCHAR_KIND (0): wchar_t*
225 PyUnicode_1BYTE_KIND (1): Py_UCS1*
226 PyUnicode_2BYTE_KIND (2): Py_UCS2*
227 PyUnicode_4BYTE_KIND (3): Py_UCS4*
228 */
229 unsigned int kind:2;
230 /* Compact is with respect to the allocation scheme. Compact unicode
231 objects only require one memory block while non-compact objects use
232 one block for the PyUnicodeObject struct and another for its data
233 buffer. */
234 unsigned int compact:1;
235 /* Compact objects which are ASCII-only also have the state.compact
236 flag set, and use the PyASCIIObject struct. */
237 unsigned int ascii:1;
238 /* The ready flag indicates whether the object layout is initialized
239 completely. This means that this is either a compact object, or
240 the data pointer is filled out. The bit is redundant, and helps
241 to minimize the test in PyUnicode_IS_READY(). */
242 unsigned int ready:1;
243 } state;
244 wchar_t *wstr; /* wchar_t representation (null-terminated) */
245} PyASCIIObject;
246
247/* Non-ASCII strings allocated through PyUnicode_New use the
248 PyCompactUnicodeObject structure. state.compact is set, and the data
249 immediately follow the structure. */
250typedef struct {
251 PyASCIIObject _base;
252 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
253 * terminating \0. */
254 char *utf8; /* UTF-8 representation (null-terminated) */
255 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
256 * surrogates count as two code points. */
257} PyCompactUnicodeObject;
258
259/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
260 PyUnicodeObject structure. The actual string data is initially in the wstr
261 block, and copied into the data block using PyUnicode_Ready. */
262typedef struct {
263 PyCompactUnicodeObject _base;
264 union {
265 void *any;
266 Py_UCS1 *latin1;
267 Py_UCS2 *ucs2;
268 Py_UCS4 *ucs4;
269 } data; /* Canonical, smallest-form Unicode buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000270} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000271#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000272
Mark Hammond91a681d2002-08-12 07:21:58 +0000273PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000274PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000275
Thomas Wouters27d517b2007-02-25 20:39:11 +0000276#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000277 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
278#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000279
280/* Fast access macros */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000281#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200282
/* Return the length stored for the wstr representation of `op`.

   When state.ascii is set the value is read from PyASCIIObject.length;
   otherwise it is read from PyCompactUnicodeObject.wstr_length.  No type
   check is performed.  `op` is parenthesized under each cast so that an
   expression argument (e.g. `base + 1`) is converted as a whole; it is
   evaluated twice, so it must not have side effects. */
#define PyUnicode_WSTR_LENGTH(op) \
    (((PyASCIIObject*)(op))->state.ascii ?      \
     ((PyASCIIObject*)(op))->length :           \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
287
288/* Returns the deprecated Py_UNICODE representation's size in code units
289 (this includes surrogate pairs as 2 units).
290 If the Py_UNICODE representation is not available, it will be computed
291 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
292
Guido van Rossumd8225182000-03-10 22:33:05 +0000293#define PyUnicode_GET_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200294 (assert(PyUnicode_Check(op)), \
295 (((PyASCIIObject *)(op))->wstr) ? \
296 PyUnicode_WSTR_LENGTH(op) : \
297 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
298 PyUnicode_WSTR_LENGTH(op)))
299
Guido van Rossumd8225182000-03-10 22:33:05 +0000300#define PyUnicode_GET_DATA_SIZE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200301 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
302
303/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
304 representation on demand. Using this macro is very inefficient now,
305 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
306 use PyUnicode_WRITE() and PyUnicode_READ(). */
307
Guido van Rossumd8225182000-03-10 22:33:05 +0000308#define PyUnicode_AS_UNICODE(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200309 (assert(PyUnicode_Check(op)), \
310 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
311 PyUnicode_AsUnicode((PyObject *)(op)))
312
Guido van Rossumd8225182000-03-10 22:33:05 +0000313#define PyUnicode_AS_DATA(op) \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 ((const char *)(PyUnicode_AS_UNICODE(op)))
315
316
317/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */
318
319/* Values for PyUnicodeObject.state: */
320
321/* Interning state. */
322#define SSTATE_NOT_INTERNED 0
323#define SSTATE_INTERNED_MORTAL 1
324#define SSTATE_INTERNED_IMMORTAL 2
325
/* Return the state.ascii flag of `op`, viewed as a PyASCIIObject.
   No type check is performed.  `op` is parenthesized under the cast so that
   an expression argument is converted as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)(op))->state.ascii)
327
328/* String contains only wstr byte characters. This is only possible
329 when the string was created with a legacy API and PyUnicode_Ready()
330 has not been called yet. */
331#define PyUnicode_WCHAR_KIND 0
332
333/* Return values of the PyUnicode_KIND() macro: */
334
335#define PyUnicode_1BYTE_KIND 1
336#define PyUnicode_2BYTE_KIND 2
337#define PyUnicode_4BYTE_KIND 3
338
339
340/* Return the number of bytes the string uses to represent single characters,
341 this can be 1, 2 or 4. */
342#define PyUnicode_CHARACTER_SIZE(op) \
343 (1 << (PyUnicode_KIND(op) - 1))
344
345/* Return pointers to the canonical representation casted as unsigned char,
346 Py_UCS2, or Py_UCS4 for direct character access.
347 No checks are performed, use PyUnicode_CHARACTER_SIZE or
348 PyUnicode_KIND() before to ensure these will work correctly. */
349
350#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
351#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
352#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
353
354/* Return true if the string is compact or 0 if not.
355 No type checks or Ready calls are performed. */
356#define PyUnicode_IS_COMPACT(op) \
357 (((PyASCIIObject*)(op))->state.compact)
358
359/* Return one of the PyUnicode_*_KIND values defined above. */
360#define PyUnicode_KIND(op) \
361 (assert(PyUnicode_Check(op)), \
362 assert(PyUnicode_IS_READY(op)), \
363 ((PyASCIIObject *)(op))->state.kind)
364
365/* Return a void pointer to the raw unicode buffer. */
366#define _PyUnicode_COMPACT_DATA(op) \
367 (PyUnicode_IS_COMPACT_ASCII(op) ? \
368 ((void*)((PyASCIIObject*)(op) + 1)) : \
369 ((void*)((PyCompactUnicodeObject*)(op) + 1)))
370
371#define _PyUnicode_NONCOMPACT_DATA(op) \
372 (assert(((PyUnicodeObject*)(op))->data.any), \
373 ((((PyUnicodeObject *)(op))->data.any)))
374
375#define PyUnicode_DATA(op) \
376 (assert(PyUnicode_Check(op)), \
377 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
378 _PyUnicode_NONCOMPACT_DATA(op))
379
380#define _PyUnicode_UTF8(op) \
381 (PyUnicode_IS_COMPACT_ASCII(op) ? \
382 ((char*)((PyASCIIObject*)(op) + 1)) : \
383 ((PyCompactUnicodeObject*)(op))->utf8)
384
385#define _PyUnicode_UTF8_LENGTH(op) \
386 (PyUnicode_IS_COMPACT_ASCII(op) ? \
387 ((PyASCIIObject*)(op))->length : \
388 ((PyCompactUnicodeObject*)(op))->utf8_length)
389
390/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
391
392 The index is a character index, the result is a size in bytes. */
393#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
394
395/* In the access macros below, "kind" may be evaluated more than once.
396 All other macro parameters are evaluated exactly once, so it is safe
397 to put side effects into them (such as increasing the index). */
398
399/* Write into the canonical representation, this macro does not do any sanity
400 checks and is intended for usage in loops. The caller should cache the
401 kind and data pointers obtained from other macro calls.
402 index is the index in the string (starts at 0) and value is the new
403 code point value which should be written to that location. */
404#define PyUnicode_WRITE(kind, data, index, value) \
405 do { \
406 switch ((kind)) { \
407 case PyUnicode_1BYTE_KIND: { \
408 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
409 break; \
410 } \
411 case PyUnicode_2BYTE_KIND: { \
412 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
413 break; \
414 } \
415 default: { \
416 assert((kind) == PyUnicode_4BYTE_KIND); \
417 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
418 } \
419 } \
420 } while (0)
421
422/* Read a code point from the string's canonical representation. No checks
423 or ready calls are performed. */
424#define PyUnicode_READ(kind, data, index) \
425 ((Py_UCS4) \
426 ((kind) == PyUnicode_1BYTE_KIND ? \
427 ((const unsigned char *)(data))[(index)] : \
428 ((kind) == PyUnicode_2BYTE_KIND ? \
429 ((const Py_UCS2 *)(data))[(index)] : \
430 ((const Py_UCS4 *)(data))[(index)] \
431 ) \
432 ))
433
434/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
435 calls PyUnicode_KIND() and might call it twice. For single reads, use
436 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
437 cache kind and use PyUnicode_READ instead. */
438#define PyUnicode_READ_CHAR(unicode, index) \
439 ((Py_UCS4) \
440 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
441 ((const unsigned char *)(PyUnicode_DATA((unicode))))[(index)] : \
442 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
443 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
444 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
445 ) \
446 ))
447
448/* Returns the length of the unicode string. The caller has to make sure that
449 the string has its canonical representation set before calling
450 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
451#define PyUnicode_GET_LENGTH(op) \
452 (assert(PyUnicode_Check(op)), \
453 assert(PyUnicode_IS_READY(op)), \
454 ((PyASCIIObject *)(op))->length)
455
456
/* Fast check to determine whether an object is ready: returns the
   state.ready flag of `op`, viewed as a PyASCIIObject.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   No type check is performed.  `op` is parenthesized under the cast so that
   an expression argument is converted as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
461
462/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
463 case. If the canonical representation is not yet set, it will still call
464 PyUnicode_Ready().
465 Returns 0 on success and -1 on errors. */
466#define PyUnicode_READY(op) \
467 (assert(PyUnicode_Check(op)), \
468 (PyUnicode_IS_READY(op) ? \
469 0 : _PyUnicode_Ready((PyUnicodeObject *)(op))))
470
471/* Generic helper macro to convert characters of different types.
472 from_type and to_type have to be valid type names, begin and end
473 are pointers to the source characters which should be of type
474 "from_type *". to is a pointer of type "to_type *" and points to the
475 buffer where the result characters are written to. */
476#define PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
477 do { \
478 const from_type *iter_; to_type *to_; \
479 for (iter_ = (begin), to_ = (to_type *)(to); \
480 iter_ < (end); \
481 ++iter_, ++to_) { \
482 *to_ = (to_type)*iter_; \
483 } \
484 } while (0)
485
486/* Return a maximum character value which is suitable for creating another
487 string based on op. This is always an approximation but more efficient
488 than iterating over the string. */
489#define PyUnicode_MAX_CHAR_VALUE(op) \
490 (assert(PyUnicode_IS_READY(op)), \
491 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
492 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
493 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
494 (0x7fU) : (0xffU) \
495 ) : \
496 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
497 (0xffffU) : (0x10ffffU) \
498 ))))
499
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000500#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000501
502/* --- Constants ---------------------------------------------------------- */
503
504/* This Unicode character will be used as replacement character during
505 decoding if the errors argument is set to "replace". Note: the
506 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
507 Unicode 3.0. */
508
Victor Stinner5ce1b0d2011-09-28 20:29:27 +0200509#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
Guido van Rossumd8225182000-03-10 22:33:05 +0000510
511/* === Public API ========================================================= */
512
513/* --- Plain Py_UNICODE --------------------------------------------------- */
514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200515/* With PEP 393, this is the recommended way to allocate a new unicode object.
516 This function will allocate the object and its buffer in a single memory
517 block. Objects created using this function are not resizable. */
518#ifndef Py_LIMITED_API
519PyAPI_FUNC(PyObject*) PyUnicode_New(
520 Py_ssize_t size, /* Number of code points in the new string */
521 Py_UCS4 maxchar /* maximum code point value in the string */
522 );
523#endif
524
525/* Initializes the canonical string representation from the deprecated
526 wstr/Py_UNICODE representation. This function is used to convert
527 unicode objects which were created using the old API to the new flexible
528 format introduced with PEP 393. The PyUnicode_READY() macro can be
529 more efficient if the string is already ready. */
530#ifndef Py_LIMITED_API
531PyAPI_FUNC(int) _PyUnicode_Ready(
532 PyUnicodeObject *unicode /* Unicode object */
533 );
534#endif
535
536/* Copy character from one unicode object into another, this function performs
537 character conversion when necessary and falls back to memcpy if possible.
538 Return -1 and raise an exception on error, return 0 on success. */
539#ifndef Py_LIMITED_API
540PyAPI_FUNC(int) PyUnicode_CopyCharacters(
541 PyObject *to,
542 Py_ssize_t to_start,
543 PyObject *from,
544 Py_ssize_t from_start,
545 Py_ssize_t how_many
546 );
547#endif
548
549/* Find the maximum code point and count the number of surrogate pairs so a
550 correct string length can be computed before converting a string to UCS4.
551 This function counts single surrogates as a character and not as a pair. */
552#ifndef Py_LIMITED_API
553PyAPI_FUNC(int) _PyUnicode_FindMaxCharAndNumSurrogatePairs(
554 const wchar_t *begin,
555 const wchar_t *end,
556 Py_UCS4 *maxchar,
557 Py_ssize_t *num_surrogates
558 );
559#endif
560
Guido van Rossumd8225182000-03-10 22:33:05 +0000561/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000562 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000563
564 u may be NULL which causes the contents to be undefined. It is the
565 user's responsibility to fill in the needed data afterwards. Note
566 that modifying the Unicode object contents after construction is
567 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000568
569 The buffer is copied into the new object. */
570
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000571#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000572PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000573 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000574 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000575 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000576#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000577
Georg Brandl952867a2010-06-27 10:17:12 +0000578/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000579PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000580 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000581 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000582 );
583
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000584/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200585 UTF-8 encoded bytes. The size is determined with strlen(). */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000586PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000587 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000588 );
589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200590#ifndef Py_LIMITED_API
591PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
592 int kind,
593 const void *buffer,
594 Py_ssize_t size);
595#endif
596
597PyAPI_FUNC(PyObject*) PyUnicode_Substring(
598 PyObject *str,
599 Py_ssize_t start,
600 Py_ssize_t end);
601
602/* Copy the string into a UCS4 buffer including the null character if copy_null
603 is set. Return NULL and raise an exception on error. Raise a ValueError if
604 the buffer is smaller than the string. Return buffer on success.
605
606 buflen is the length of the buffer in (Py_UCS4) characters. */
607PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
608 PyObject *unicode,
609 Py_UCS4* buffer,
610 Py_ssize_t buflen,
611 int copy_null);
612
613/* Copy the string into a UCS4 buffer. A new buffer is allocated using
614 * PyMem_Malloc; if this fails, NULL is returned with a memory error
615 exception set. */
616PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
617
Guido van Rossumd8225182000-03-10 22:33:05 +0000618/* Return a read-only pointer to the Unicode object's internal
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200619 Py_UNICODE buffer.
620 If the wchar_t/Py_UNICODE representation is not yet available, this
621 function will calculate it. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000622
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000623#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000624PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000626 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000627#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629/* Return a read-only pointer to the Unicode object's internal
630 Py_UNICODE buffer and save the length at size.
631 If the wchar_t/Py_UNICODE representation is not yet available, this
632 function will calculate it. */
633
634#ifndef Py_LIMITED_API
635PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
636 PyObject *unicode, /* Unicode object */
637 Py_ssize_t *size /* location where to save the length */
638 );
639#endif
640
Guido van Rossumd8225182000-03-10 22:33:05 +0000641/* Get the length of the Unicode object. */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
644 PyObject *unicode
645);
646
647/* Get the number of Py_UNICODE units in the
648 string representation. */
649
Martin v. Löwis18e16552006-02-15 17:27:45 +0000650PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000652 );
653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654/* Read a character from the string. */
655
656PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
657 PyObject *unicode,
658 Py_ssize_t index
659 );
660
661/* Write a character to the string. The string must have been created through
662 PyUnicode_New, must not be shared, and must not have been hashed yet. */
663
/* NOTE(review): the meaning of the int result is not documented here;
   presumably 0 on success and -1 on error -- confirm. */
664PyAPI_FUNC(int) PyUnicode_WriteChar(
665 PyObject *unicode, /* Unicode object to modify */
666 Py_ssize_t index, /* position of the character to write */
667 Py_UCS4 character /* character to store */
668 );
669
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000670#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000671/* Get the maximum ordinal for a Unicode character. */
/* The value is returned as a Py_UNICODE. This declaration is compiled out
   when Py_LIMITED_API is defined. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000672PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000673#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000674
Guido van Rossum52c23592000-04-10 13:41:41 +0000675/* Resize an already allocated Unicode object to the new size length.
676
677 *unicode is modified to point to the new (resized) object and 0
678 returned on success.
679
680 This API may only be called by the function which also called the
681 Unicode constructor. The refcount on the object must be 1. Otherwise,
682 an error is returned.
683
684 Error handling is implemented as follows: an exception is set, -1
685 is returned and *unicode left untouched.
686
687*/
688
/* Returns an int status: 0 on success, -1 on error. unicode is passed by
   address so that *unicode can be updated to point to the resized object. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000689PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 PyObject **unicode, /* Pointer to the Unicode object */
691 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000692 );
693
Guido van Rossumd8225182000-03-10 22:33:05 +0000694/* Coerce obj to a Unicode object and return a reference with
695 *incremented* refcount.
696
697 Coercion is done in the following way:
698
Georg Brandl952867a2010-06-27 10:17:12 +0000699 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000700 under the assumptions that they contain data using the UTF-8
701 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000702
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000703 2. All other objects (including Unicode objects) raise an
704 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000705
706 The API returns NULL in case of an error. The caller is responsible
707 for decref'ing the returned objects.
708
709*/
710
Mark Hammond91a681d2002-08-12 07:21:58 +0000711PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000713 const char *encoding, /* encoding */
714 const char *errors /* error handling */
715 );
716
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000717/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000718 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000720 Unicode objects are passed back as-is (subclasses are converted to
721 true Unicode objects), all other objects are delegated to
722 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000723 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000724
725 The API returns NULL in case of an error. The caller is responsible
726 for decref'ing the returned objects.
727
728*/
729
Mark Hammond91a681d2002-08-12 07:21:58 +0000730PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000732 );
733
/* PyUnicode_FromFormatV and PyUnicode_FromFormat both take an ASCII-encoded
   format string and return an object reference; they differ only in how the
   remaining arguments are supplied (a va_list versus a variable argument
   list).
   NOTE(review): the supported format codes are not listed here; presumably
   printf-like -- confirm. */
Victor Stinner1205f272010-09-11 00:54:47 +0000734PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
735 const char *format, /* ASCII-encoded string */
736 va_list vargs
737 );
738PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
739 const char *format, /* ASCII-encoded string */
740 ...
741 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000743#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000744/* Format the object based on the format_spec, as defined in PEP 3101
745 (Advanced String Formatting). */
/* Private; compiled out when Py_LIMITED_API is defined.
   NOTE(review): start and end presumably delimit the part of format_spec
   to use -- confirm. */
746PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747 PyObject *format_spec,
748 Py_ssize_t start,
749 Py_ssize_t end);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000750#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000751
/* Interning API. PyUnicode_InternInPlace and PyUnicode_InternImmortal
   receive the address of an object reference and return nothing.
   NOTE(review): presumably the reference may be replaced by the interned
   string, and the two differ in the lifetime of the interned entry --
   confirm. */
Walter Dörwald16807132007-05-25 13:52:07 +0000752PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
753PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
/* Takes a UTF-8 encoded C string and returns an object reference. */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000754PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
755 const char *u /* UTF-8 encoded string */
756 );
/* Private; compiled out when Py_LIMITED_API is defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000757#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000758PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000759#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000760
761/* Use only if you know it's a string */
/* Casts op to PyASCIIObject * without any type check and evaluates to the
   interned field of its state member. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200762#define PyUnicode_CHECK_INTERNED(op) \
763 (((PyASCIIObject *)(op))->state.interned)
Walter Dörwald16807132007-05-25 13:52:07 +0000764
Guido van Rossumd8225182000-03-10 22:33:05 +0000765/* --- wchar_t support for platforms which support it --------------------- */
766
767#ifdef HAVE_WCHAR_H
768
Georg Brandl952867a2010-06-27 10:17:12 +0000769/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000770 size.
771
772 The buffer is copied into the new object. */
773
Mark Hammond91a681d2002-08-12 07:21:58 +0000774PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000775 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000776 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000777 );
778
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000779/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000780 most size wchar_t characters are copied.
781
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000782 Note that the resulting wchar_t string may or may not be
783 0-terminated. It is the responsibility of the caller to make sure
784 that the wchar_t string is 0-terminated in case this is required by
785 the application.
786
787 Returns the number of wchar_t characters copied (excluding a
788 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000789 error. */
790
Martin v. Löwis18e16552006-02-15 17:27:45 +0000791PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000792 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000793 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000794 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000795 );
796
Victor Stinner137c34c2010-09-29 10:25:54 +0000797/* Convert the Unicode object to a wide character string. The output string
798 always ends with a nul character. If size is not NULL, write the number of
Victor Stinnerd88d9832011-09-06 02:00:05 +0200799 wide characters (excluding the null character) into *size.
Victor Stinner137c34c2010-09-29 10:25:54 +0000800
801 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
802 on success. On error, returns NULL, *size is undefined and raises a
803 MemoryError. */
804
/* Returns a wchar_t pointer, or NULL on error. size is an output parameter
   and may be NULL when the caller does not need the length. */
805PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000806 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000807 Py_ssize_t *size /* number of characters of the result */
808 );
809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
811
Guido van Rossumd8225182000-03-10 22:33:05 +0000812#endif
813
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000814/* --- Unicode ordinals --------------------------------------------------- */
815
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816/* Create a Unicode Object from the given Unicode code point ordinal.
817
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000818 The ordinal must be in range(0x10000) on narrow Python builds
819 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
820 raised in case it is not.
821
822*/
823
/* ordinal is the code point as an int; the result is an object reference.
   A ValueError is raised for an out-of-range ordinal. */
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000824PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000825
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000826/* --- Free-list management ----------------------------------------------- */
827
828/* Clear the free list used by the Unicode implementation.
829
830 This can be used to release memory used for objects on the free
831 list back to the Python memory allocator.
832
833*/
834
/* Takes no arguments.
   NOTE(review): the meaning of the int result is not documented here;
   presumably the number of entries released -- confirm. */
835PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
836
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000838
839 Many of these APIs take two arguments encoding and errors. These
840 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000841 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000842
Georg Brandl952867a2010-06-27 10:17:12 +0000843 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000844
845 Error handling is set by errors which may also be set to NULL
846 meaning to use the default handling defined for the codec. Default
847 error handling for all builtin codecs is "strict" (ValueErrors are
848 raised).
849
850 The codecs all use a similar interface. Only deviations from the
851 generic ones are documented.
852
853*/
854
Fred Drakecb093fe2000-05-09 19:51:53 +0000855/* --- Manage the default encoding ---------------------------------------- */
856
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000857/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000858 Unicode object unicode and the size of the encoded representation
859 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000860
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000861 In case of an error, no *size is set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863 This function caches the UTF-8 encoded string in the unicodeobject
864 and subsequent calls will return the same string. The memory is released
865 when the unicodeobject is deallocated.
866
867 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
868 support the previous internal function with the same behaviour.
869
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000870 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000871 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000872
873 *** If you need to access the Unicode object as UTF-8 bytes string,
874 *** please use PyUnicode_AsUTF8String() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000875*/
876
/* Both the function and its legacy alias are compiled out when
   Py_LIMITED_API is defined. The result is a char pointer; the encoded
   size in bytes is stored through size. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000877#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200878PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000880 Py_ssize_t *size);
/* Legacy name kept as an alias for PyUnicode_AsUTF8AndSize. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000882#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000883
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000884/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000885 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
888 in the unicodeobject.
889
890 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
891 support the previous internal function with the same behaviour.
892
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000893 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000894 extracted from the returned data.
895
896 *** This API is for interpreter INTERNAL USE ONLY and will likely
897 *** be removed or changed in the future.
898
899 *** If you need to access the Unicode object as UTF-8 bytes string,
900 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000901
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000902*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000903
/* Both the function and its legacy alias are compiled out when
   Py_LIMITED_API is defined. The result is a char pointer with no
   accompanying length. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000904#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
/* Legacy name kept as an alias for PyUnicode_AsUTF8. */
906#define _PyUnicode_AsString PyUnicode_AsUTF8
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000907#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000908
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000909/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000910
/* The name is returned as a const char *, so callers must not modify it.
   NOTE(review): presumably static storage that must not be freed --
   confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000911PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000912
Guido van Rossumd8225182000-03-10 22:33:05 +0000913/* --- Generic Codecs ----------------------------------------------------- */
914
915/* Create a Unicode object by decoding the encoded string s of the
916 given size. */
917
/* Returns an object reference. A NULL encoding selects the default encoding
   (UTF-8); a NULL errors selects the codec's default error handling. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000918PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000919 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000920 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000921 const char *encoding, /* encoding */
922 const char *errors /* error handling */
923 );
924
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000925/* Decode a Unicode object unicode and return the result as Python
926 object. */
927
/* Both decoders below take a Unicode object plus encoding and
   error-handling names and return an object reference.
   NOTE(review): presumably NULL is returned on error -- confirm. */
928PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000929 PyObject *unicode, /* Unicode object */
930 const char *encoding, /* encoding */
931 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000932 );
933
934/* Decode a Unicode object unicode and return the result as Unicode
935 object. */
936
937PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000938 PyObject *unicode, /* Unicode object */
939 const char *encoding, /* encoding */
940 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000941 );
942
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000943/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000944 Python string object. */
945
/* Operates on a raw Py_UNICODE buffer rather than on a Unicode object.
   Compiled out when Py_LIMITED_API is defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000946#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000947PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000948 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000949 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000950 const char *encoding, /* encoding */
951 const char *errors /* error handling */
952 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000953#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000954
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000955/* Encodes a Unicode object and returns the result as Python
956 object. */
957
/* The three encoders below share one signature: a Unicode object plus
   encoding and error-handling names, returning an object reference.
   NOTE(review): presumably NULL is returned on error -- confirm. */
958PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000959 PyObject *unicode, /* Unicode object */
960 const char *encoding, /* encoding */
961 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000962 );
963
Guido van Rossumd8225182000-03-10 22:33:05 +0000964/* Encodes a Unicode object and returns the result as Python string
965 object. */
966
Mark Hammond91a681d2002-08-12 07:21:58 +0000967PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 PyObject *unicode, /* Unicode object */
969 const char *encoding, /* encoding */
970 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000971 );
972
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000973/* Encodes a Unicode object and returns the result as Unicode
974 object. */
975
976PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000977 PyObject *unicode, /* Unicode object */
978 const char *encoding, /* encoding */
979 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000980 );
981
982/* Build an encoding map. */
983
/* Takes the 256 character map as an object and returns an object
   reference.
   NOTE(review): the type of the returned map object is not stated here --
   confirm. */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000984PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
985 PyObject* string /* 256 character map */
986 );
987
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988/* --- UTF-7 Codecs ------------------------------------------------------- */
989
/* Decoder: takes length bytes of UTF-7 data plus an error-handling name and
   returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000990PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991 const char *string, /* UTF-7 encoded string */
992 Py_ssize_t length, /* size of string */
993 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 );
995
/* Same as above with an extra output parameter that reports the bytes
   consumed.
   NOTE(review): presumably a NULL consumed makes this behave like
   PyUnicode_DecodeUTF7 -- confirm. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000996PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000997 const char *string, /* UTF-7 encoded string */
998 Py_ssize_t length, /* size of string */
999 const char *errors, /* error handling */
1000 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001001 );
1002
/* Encoder working on a raw Py_UNICODE buffer; the two int flags select
   which character classes are base64-encoded. Compiled out when
   Py_LIMITED_API is defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001003#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001004PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001005 const Py_UNICODE *data, /* Unicode char buffer */
1006 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1007 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
1008 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
1009 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001010 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001011#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012
Guido van Rossumd8225182000-03-10 22:33:05 +00001013/* --- UTF-8 Codecs ------------------------------------------------------- */
1014
/* Decoder: takes length bytes of UTF-8 data plus an error-handling name and
   returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001015PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 const char *string, /* UTF-8 encoded string */
1017 Py_ssize_t length, /* size of string */
1018 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001019 );
1020
/* Same as above with an extra output parameter that reports the bytes
   consumed.
   NOTE(review): presumably a NULL consumed makes this behave like
   PyUnicode_DecodeUTF8 -- confirm. */
Walter Dörwald69652032004-09-07 20:24:22 +00001021PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001022 const char *string, /* UTF-8 encoded string */
1023 Py_ssize_t length, /* size of string */
1024 const char *errors, /* error handling */
1025 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001026 );
1027
/* Encodes a Unicode object as UTF-8 and returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001028PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001030 );
1031
/* The two declarations below are compiled out when Py_LIMITED_API is
   defined. _PyUnicode_AsUTF8String is the private variant that also takes
   an error-handling name; PyUnicode_EncodeUTF8 works on a raw Py_UNICODE
   buffer. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001032#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1034 PyObject *unicode,
1035 const char *errors);
1036
Mark Hammond91a681d2002-08-12 07:21:58 +00001037PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001038 const Py_UNICODE *data, /* Unicode char buffer */
1039 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1040 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001041 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001042#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001043
Walter Dörwald41980ca2007-08-16 21:55:45 +00001044/* --- UTF-32 Codecs ------------------------------------------------------ */
1045
1046/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1047 the corresponding Unicode object.
1048
1049 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +00001051
1052 If byteorder is non-NULL, the decoder starts decoding using the
1053 given byte order:
1054
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 *byteorder == -1: little endian
1056 *byteorder == 0: native order
1057 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +00001058
1059 In native mode, the first four bytes of the stream are checked for a
1060 BOM mark. If found, the BOM mark is analysed, the byte order
1061 adjusted and the BOM skipped. In the other modes, no BOM mark
1062 interpretation is done. After completion, *byteorder is set to the
1063 current byte order at the end of input data.
1064
1065 If byteorder is NULL, the codec starts in native order mode.
1066
1067*/
1068
/* byteorder is an in/out parameter: it selects the starting byte order and
   is updated on exit; it may be NULL, in which case decoding starts in
   native order. */
1069PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001070 const char *string, /* UTF-32 encoded string */
1071 Py_ssize_t length, /* size of string */
1072 const char *errors, /* error handling */
1073 int *byteorder /* pointer to byteorder to use
1074 0=native;-1=LE,1=BE; updated on
1075 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001076 );
1077
/* Same as above with an extra output parameter that reports the bytes
   consumed.
   NOTE(review): presumably a NULL consumed makes this behave like
   PyUnicode_DecodeUTF32 -- confirm. */
1078PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001079 const char *string, /* UTF-32 encoded string */
1080 Py_ssize_t length, /* size of string */
1081 const char *errors, /* error handling */
1082 int *byteorder, /* pointer to byteorder to use
1083 0=native;-1=LE,1=BE; updated on
1084 exit */
1085 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001086 );
1087
1088/* Returns a Python string using the UTF-32 encoding in native byte
1089 order. The string always starts with a BOM mark. */
1090
/* Takes a Unicode object and returns an object reference.
   NOTE(review): presumably NULL is returned on error -- confirm. */
1091PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001093 );
1094
1095/* Returns a Python string object holding the UTF-32 encoded value of
1096 the Unicode data.
1097
1098 If byteorder is not 0, output is written according to the following
1099 byte order:
1100
1101 byteorder == -1: little endian
1102 byteorder == 0: native byte order (writes a BOM mark)
1103 byteorder == 1: big endian
1104
1105 If byteorder is 0, the output string will always start with the
1106 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1107 prepended.
1108
1109*/
1110
/* Encoder working on a raw Py_UNICODE buffer; byteorder is passed by value
   here (unlike the decoders). Compiled out when Py_LIMITED_API is
   defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001111#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +00001112PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001113 const Py_UNICODE *data, /* Unicode char buffer */
1114 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1115 const char *errors, /* error handling */
1116 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +00001117 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001118#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00001119
Guido van Rossumd8225182000-03-10 22:33:05 +00001120/* --- UTF-16 Codecs ------------------------------------------------------ */
1121
Guido van Rossum9e896b32000-04-05 20:11:21 +00001122/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +00001123 the corresponding Unicode object.
1124
1125 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +00001127
1128 If byteorder is non-NULL, the decoder starts decoding using the
1129 given byte order:
1130
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 *byteorder == -1: little endian
1132 *byteorder == 0: native order
1133 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +00001134
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001135 In native mode, the first two bytes of the stream are checked for a
1136 BOM mark. If found, the BOM mark is analysed, the byte order
1137 adjusted and the BOM skipped. In the other modes, no BOM mark
1138 interpretation is done. After completion, *byteorder is set to the
1139 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001140
1141 If byteorder is NULL, the codec starts in native order mode.
1142
1143*/
1144
/* byteorder is an in/out parameter: it selects the starting byte order and
   is updated on exit; it may be NULL, in which case decoding starts in
   native order. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001145PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001146 const char *string, /* UTF-16 encoded string */
1147 Py_ssize_t length, /* size of string */
1148 const char *errors, /* error handling */
1149 int *byteorder /* pointer to byteorder to use
1150 0=native;-1=LE,1=BE; updated on
1151 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +00001152 );
1153
/* Same as above with an extra output parameter that reports the bytes
   consumed.
   NOTE(review): presumably a NULL consumed makes this behave like
   PyUnicode_DecodeUTF16 -- confirm. */
Walter Dörwald69652032004-09-07 20:24:22 +00001154PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 const char *string, /* UTF-16 encoded string */
1156 Py_ssize_t length, /* size of string */
1157 const char *errors, /* error handling */
1158 int *byteorder, /* pointer to byteorder to use
1159 0=native;-1=LE,1=BE; updated on
1160 exit */
1161 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +00001162 );
1163
Guido van Rossumd8225182000-03-10 22:33:05 +00001164/* Returns a Python string using the UTF-16 encoding in native byte
1165 order. The string always starts with a BOM mark. */
1166
/* Takes a Unicode object and returns an object reference.
   NOTE(review): presumably NULL is returned on error -- confirm. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001167PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001169 );
1170
1171/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001172 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001173
1174 If byteorder is not 0, output is written according to the following
1175 byte order:
1176
1177 byteorder == -1: little endian
1178 byteorder == 0: native byte order (writes a BOM mark)
1179 byteorder == 1: big endian
1180
1181 If byteorder is 0, the output string will always start with the
1182 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1183 prepended.
1184
1185 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1186 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001187 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001188
1189*/
1190
/* Encoder working on a raw Py_UNICODE buffer; byteorder is passed by value
   here (unlike the decoders). Compiled out when Py_LIMITED_API is
   defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001191#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001192PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 const Py_UNICODE *data, /* Unicode char buffer */
1194 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1195 const char *errors, /* error handling */
1196 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001197 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001198#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001199
1200/* --- Unicode-Escape Codecs ---------------------------------------------- */
1201
/* Decoder: takes length bytes of Unicode-Escape data plus an
   error-handling name and returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001202PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001203 const char *string, /* Unicode-Escape encoded string */
1204 Py_ssize_t length, /* size of string */
1205 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001206 );
1207
/* Encodes a Unicode object; takes no error-handling argument. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001208PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001209 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001210 );
1211
/* Encoder working on a raw Py_UNICODE buffer; takes no error-handling
   argument. Compiled out when Py_LIMITED_API is defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001212#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001213PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 const Py_UNICODE *data, /* Unicode char buffer */
1215 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001216 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001217#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001218
1219/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1220
/* Decoder: takes length bytes of Raw-Unicode-Escape data plus an
   error-handling name and returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001221PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 const char *string, /* Raw-Unicode-Escape encoded string */
1223 Py_ssize_t length, /* size of string */
1224 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001225 );
1226
/* Encodes a Unicode object; takes no error-handling argument. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001227PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001229 );
1230
/* Encoder working on a raw Py_UNICODE buffer; takes no error-handling
   argument. Compiled out when Py_LIMITED_API is defined. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001231#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001232PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001233 const Py_UNICODE *data, /* Unicode char buffer */
1234 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001235 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001236#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001237
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001238/* --- Unicode Internal Codec ---------------------------------------------
1239
1240 Only for internal use in _codecsmodule.c */
1241
/* Private decoder taking a byte buffer, its length and an error-handling
   name. Compiled out when Py_LIMITED_API is defined.
   NOTE(review): unlike the other declarations here, this one is not wrapped
   in PyAPI_FUNC(); presumably intentional because it is for internal use
   only -- confirm. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001242#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001243PyObject *_PyUnicode_DecodeUnicodeInternal(
1244 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001245 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001246 const char *errors
1247 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001248#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001250/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001251
1252 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1253
1254*/
1255
/* Decoder: takes length bytes of Latin-1 data plus an error-handling name
   and returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001256PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001257 const char *string, /* Latin-1 encoded string */
1258 Py_ssize_t length, /* size of string */
1259 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001260 );
1261
/* Encodes a Unicode object as Latin-1 and returns an object reference. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001262PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001263 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001264 );
1265
/* The two declarations below are compiled out when Py_LIMITED_API is
   defined. _PyUnicode_AsLatin1String is the private variant that also takes
   an error-handling name; PyUnicode_EncodeLatin1 works on a raw Py_UNICODE
   buffer. */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001266#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1268 PyObject* unicode,
1269 const char* errors);
1270
Mark Hammond91a681d2002-08-12 07:21:58 +00001271PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 const Py_UNICODE *data, /* Unicode char buffer */
1273 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1274 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001275 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001276#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001279
1280 Only 7-bit ASCII data is accepted. All other codes generate errors.
1281
1282*/
1283
Mark Hammond91a681d2002-08-12 07:21:58 +00001284PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 const char *string, /* ASCII encoded string */
1286 Py_ssize_t length, /* size of string */
1287 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001288 );
1289
Mark Hammond91a681d2002-08-12 07:21:58 +00001290PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001292 );
1293
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001294#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1296 PyObject* unicode,
1297 const char* errors);
1298
Mark Hammond91a681d2002-08-12 07:21:58 +00001299PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 const Py_UNICODE *data, /* Unicode char buffer */
1301 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1302 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001303 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001304#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001305
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001307
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001309
1310 Decoding mappings must map single string characters to single
1311 Unicode characters, integers (which are then interpreted as Unicode
1312 ordinals) or None (meaning "undefined mapping" and causing an
1313 error).
1314
1315 Encoding mappings must map single Unicode characters to single
1316 string characters, integers (which are then interpreted as Latin-1
1317 ordinals) or None (meaning "undefined mapping" and causing an
1318 error).
1319
1320 If a character lookup fails with a LookupError, the character is
1321 copied as-is meaning that its ordinal value will be interpreted as
1322 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1323 to contain those mappings which map characters to different code
1324 points.
1325
1326*/
1327
Mark Hammond91a681d2002-08-12 07:21:58 +00001328PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 const char *string, /* Encoded string */
1330 Py_ssize_t length, /* size of string */
1331 PyObject *mapping, /* character mapping
1332 (char ordinal -> unicode ordinal) */
1333 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001334 );
1335
Mark Hammond91a681d2002-08-12 07:21:58 +00001336PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 PyObject *unicode, /* Unicode object */
1338 PyObject *mapping /* character mapping
1339 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001340 );
1341
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001342#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001343PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344 const Py_UNICODE *data, /* Unicode char buffer */
1345 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1346 PyObject *mapping, /* character mapping
1347 (unicode ordinal -> char ordinal) */
1348 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001349 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001350#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001351
1352/* Translate a Py_UNICODE buffer of the given length by applying a
1353 character mapping table to it and return the resulting Unicode
1354 object.
1355
1356 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001358
1359 Mapping tables may be dictionaries or sequences. Unmapped character
1360 ordinals (ones which cause a LookupError) are left untouched and
1361 are copied as-is.
1362
1363*/
1364
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001365#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001366PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 const Py_UNICODE *data, /* Unicode char buffer */
1368 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1369 PyObject *table, /* Translate table */
1370 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001371 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001372#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001373
Victor Stinner99b95382011-07-04 14:23:54 +02001374#ifdef HAVE_MBCS
Guido van Rossum24bdb042000-03-28 20:29:59 +00001375
Guido van Rossumefec1152000-03-28 02:01:15 +00001376/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001377
Mark Hammond91a681d2002-08-12 07:21:58 +00001378PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001379 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001380 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001381 const char *errors /* error handling */
1382 );
1383
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001384PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1385 const char *string, /* MBCS encoded string */
1386 Py_ssize_t length, /* size of string */
1387 const char *errors, /* error handling */
1388 Py_ssize_t *consumed /* bytes consumed */
1389 );
1390
Mark Hammond91a681d2002-08-12 07:21:58 +00001391PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001392 PyObject *unicode /* Unicode object */
1393 );
1394
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001395#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001396PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001397 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001398 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001399 const char *errors /* error handling */
1400 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001401#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001402
Victor Stinner99b95382011-07-04 14:23:54 +02001403#endif /* HAVE_MBCS */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001404
Guido van Rossum9e896b32000-04-05 20:11:21 +00001405/* --- Decimal Encoder ---------------------------------------------------- */
1406
1407/* Takes a Unicode string holding a decimal value and writes it into
1408 an output buffer using standard ASCII digit codes.
1409
1410 The output buffer has to provide at least length+1 bytes of storage
1411 area. The output string is 0-terminated.
1412
1413 The encoder converts whitespace to ' ', decimal characters to their
1414 corresponding ASCII digit and all other Latin-1 characters except
1415 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1416 are treated as errors. This includes embedded NUL characters.
1417
1418 Error handling is defined by the errors argument:
1419
1420 NULL or "strict": raise a ValueError
1421 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001423 "replace": replaces illegal characters with '?'
1424
1425 Returns 0 on success, -1 on failure.
1426
1427*/
1428
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001429#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001430PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 Py_UNICODE *s, /* Unicode buffer */
1432 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1433 char *output, /* Output buffer; must have size >= length+1 */
1434 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001435 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001436#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001437
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001438/* Transforms code points that have decimal digit property to the
1439 corresponding ASCII digit code points.
1440
1441 Returns a new Unicode string on success, NULL on failure.
1442*/
1443
Georg Brandlb5503082010-12-05 11:40:48 +00001444#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001445PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1446 Py_UNICODE *s, /* Unicode buffer */
1447 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1448 );
Georg Brandlb5503082010-12-05 11:40:48 +00001449#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1452 as argument instead of a raw buffer and length. This function additionally
1453 transforms spaces to ASCII because this is what the callers in longobject,
1454 floatobject, and complexobject did anyways. */
1455
1456#ifndef Py_LIMITED_API
1457PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1458 PyObject *unicode /* Unicode object */
1459 );
1460#endif
1461
Martin v. Löwis011e8422009-05-05 04:43:17 +00001462/* --- File system encoding ---------------------------------------------- */
1463
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001464/* ParseTuple converter: encode str objects to bytes using
1465 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001466
1467PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1468
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001469/* ParseTuple converter: decode bytes objects to unicode using
1470 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1471
1472PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1473
Victor Stinner77c38622010-05-14 15:58:55 +00001474/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1475 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001476
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001477 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1478 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001479
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001480 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001481*/
1482
1483PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1484 const char *s /* encoded string */
1485 );
1486
Victor Stinner77c38622010-05-14 15:58:55 +00001487/* Decode a string using Py_FileSystemDefaultEncoding
1488 and the "surrogateescape" error handler.
1489
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001490 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1491 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001492*/
1493
Martin v. Löwis011e8422009-05-05 04:43:17 +00001494PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1495 const char *s, /* encoded string */
1496 Py_ssize_t size /* size */
1497 );
1498
Victor Stinnerae6265f2010-05-15 16:27:27 +00001499/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001500 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001501
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001502 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1503 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001504*/
1505
1506PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1507 PyObject *unicode
1508 );
1509
Guido van Rossumd8225182000-03-10 22:33:05 +00001510/* --- Methods & Slots ----------------------------------------------------
1511
1512 These are capable of handling Unicode objects and strings on input
1513 (we refer to them as strings in the descriptions) and return
1514 Unicode objects or integers as appropriate. */
1515
1516/* Concat two strings giving a new Unicode string. */
1517
Mark Hammond91a681d2002-08-12 07:21:58 +00001518PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001519 PyObject *left, /* Left string */
1520 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001521 );
1522
Walter Dörwald1ab83302007-05-18 17:15:44 +00001523/* Concat two strings and put the result in *pleft
1524 (sets *pleft to NULL on error) */
1525
1526PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 PyObject **pleft, /* Pointer to left string */
1528 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001529 );
1530
1531/* Concat two strings, put the result in *pleft and drop the right object
1532 (sets *pleft to NULL on error) */
1533
1534PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001535 PyObject **pleft, /* Pointer to left string */
1536 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001537 );
1538
Guido van Rossumd8225182000-03-10 22:33:05 +00001539/* Split a string giving a list of Unicode strings.
1540
1541 If sep is NULL, splitting will be done at all whitespace
1542 substrings. Otherwise, splits occur at the given separator.
1543
1544 At most maxsplit splits will be done. If negative, no limit is set.
1545
1546 Separators are not included in the resulting list.
1547
1548*/
1549
Mark Hammond91a681d2002-08-12 07:21:58 +00001550PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001551 PyObject *s, /* String to split */
1552 PyObject *sep, /* String separator */
1553 Py_ssize_t maxsplit /* Maxsplit count */
1554 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001555
1556/* Split a string at line breaks, giving a list of Unicode strings.
1557
1558 CRLF is considered to be one line break. Line breaks are not
1559 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001560
Mark Hammond91a681d2002-08-12 07:21:58 +00001561PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001562 PyObject *s, /* String to split */
1563 int keepends /* If true, line end markers are included */
1564 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001565
Thomas Wouters477c8d52006-05-27 19:21:47 +00001566/* Partition a string using a given separator. */
1567
1568PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001569 PyObject *s, /* String to partition */
1570 PyObject *sep /* String separator */
1571 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001572
1573/* Partition a string using a given separator, searching from the end of the
1574 string. */
1575
1576PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001577 PyObject *s, /* String to partition */
1578 PyObject *sep /* String separator */
1579 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001580
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001581/* Split a string giving a list of Unicode strings.
1582
1583 If sep is NULL, splitting will be done at all whitespace
1584 substrings. Otherwise, splits occur at the given separator.
1585
1586 At most maxsplit splits will be done. But unlike PyUnicode_Split
1587 PyUnicode_RSplit splits from the end of the string. If negative,
1588 no limit is set.
1589
1590 Separators are not included in the resulting list.
1591
1592*/
1593
1594PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 PyObject *s, /* String to split */
1596 PyObject *sep, /* String separator */
1597 Py_ssize_t maxsplit /* Maxsplit count */
1598 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001599
Guido van Rossumd8225182000-03-10 22:33:05 +00001600/* Translate a string by applying a character mapping table to it and
1601 return the resulting Unicode object.
1602
1603 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001604 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001605
1606 Mapping tables may be dictionaries or sequences. Unmapped character
1607 ordinals (ones which cause a LookupError) are left untouched and
1608 are copied as-is.
1609
1610*/
1611
Mark Hammond91a681d2002-08-12 07:21:58 +00001612PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001613 PyObject *str, /* String */
1614 PyObject *table, /* Translate table */
1615 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001616 );
1617
1618/* Join a sequence of strings using the given separator and return
1619 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001620
Mark Hammond91a681d2002-08-12 07:21:58 +00001621PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001622 PyObject *separator, /* Separator string */
1623 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001624 );
1625
1626/* Return 1 if substr matches str[start:end] at the given tail end, 0
1627 otherwise. Return -1 if an error occurred. */
1628
Martin v. Löwis18e16552006-02-15 17:27:45 +00001629PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001630 PyObject *str, /* String */
1631 PyObject *substr, /* Prefix or Suffix string */
1632 Py_ssize_t start, /* Start index */
1633 Py_ssize_t end, /* Stop index */
1634 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001635 );
1636
1637/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001638 given search direction or -1 if not found. -2 is returned in case
1639 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001640
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001642 PyObject *str, /* String */
1643 PyObject *substr, /* Substring to find */
1644 Py_ssize_t start, /* Start index */
1645 Py_ssize_t end, /* Stop index */
1646 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001647 );
1648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649/* Like PyUnicode_Find, but search for single character only. */
1650PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1651 PyObject *str,
1652 Py_UCS4 ch,
1653 Py_ssize_t start,
1654 Py_ssize_t end,
1655 int direction
1656 );
1657
Barry Warsaw51ac5802000-03-20 16:36:48 +00001658/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001659
Martin v. Löwis18e16552006-02-15 17:27:45 +00001660PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001661 PyObject *str, /* String */
1662 PyObject *substr, /* Substring to count */
1663 Py_ssize_t start, /* Start index */
1664 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001665 );
1666
Barry Warsaw51ac5802000-03-20 16:36:48 +00001667/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001668 and return the resulting Unicode object. */
1669
Mark Hammond91a681d2002-08-12 07:21:58 +00001670PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001671 PyObject *str, /* String */
1672 PyObject *substr, /* Substring to find */
1673 PyObject *replstr, /* Substring to replace */
1674 Py_ssize_t maxcount /* Max. number of replacements to apply;
1675 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001676 );
1677
1678/* Compare two strings and return -1, 0, 1 for less than, equal,
1679 greater than resp. */
1680
Mark Hammond91a681d2002-08-12 07:21:58 +00001681PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001682 PyObject *left, /* Left string */
1683 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001684 );
1685
Martin v. Löwis5b222132007-06-10 09:51:05 +00001686PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1687 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001688 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001689 );
1690
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001691/* Rich compare two strings and return one of the following:
1692
1693 - NULL in case an exception was raised
1694 - Py_True or Py_False for successful comparisons
1695 - Py_NotImplemented in case the type combination is unknown
1696
1697 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1698 case the conversion of the arguments to Unicode fails with a
1699 UnicodeDecodeError.
1700
1701 Possible values for op:
1702
1703 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1704
1705*/
1706
1707PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001708 PyObject *left, /* Left string */
1709 PyObject *right, /* Right string */
1710 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001711 );
1712
Thomas Wouters7e474022000-07-16 12:04:32 +00001713/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001714 the resulting Unicode string. */
1715
Mark Hammond91a681d2002-08-12 07:21:58 +00001716PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001717 PyObject *format, /* Format string */
1718 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001719 );
1720
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001721/* Checks whether element is contained in container and return 1/0
1722 accordingly.
1723
1724 element has to coerce to a one-element Unicode string. -1 is
1725 returned in case of an error. */
1726
Mark Hammond91a681d2002-08-12 07:21:58 +00001727PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001728 PyObject *container, /* Container string */
1729 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001730 );
1731
Martin v. Löwis47383402007-08-15 07:32:56 +00001732/* Checks whether argument is a valid identifier. */
1733
1734PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1735
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001736#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001737/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001738PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001739 PyUnicodeObject *self,
1740 int striptype,
1741 PyObject *sepobj
1742 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001743#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001744
Eric Smith5807c412008-05-11 21:00:57 +00001745/* Using the current locale, insert the thousands grouping
1746 into the string pointed to by buffer. For the argument descriptions,
1747 see Objects/stringlib/localeutil.h */
1748
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001749#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001750PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1751 Py_ssize_t n_buffer,
1752 Py_UNICODE *digits,
1753 Py_ssize_t n_digits,
1754 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001755#endif
Eric Smith5807c412008-05-11 21:00:57 +00001756
Eric Smitha3b1ac82009-04-03 14:45:06 +00001757/* Using explicit passed-in values, insert the thousands grouping
1758 into the string pointed to by buffer. For the argument descriptions,
1759 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001760#ifndef Py_LIMITED_API
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1762 int kind,
1763 void *buffer,
1764 Py_ssize_t n_buffer,
1765 void *digits,
1766 Py_ssize_t n_digits,
1767 Py_ssize_t min_width,
1768 const char *grouping,
1769 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001770#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001771/* === Characters Type APIs =============================================== */
1772
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001773/* Helper array used by Py_UNICODE_ISSPACE(). */
1774
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001775#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001776PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1777
Guido van Rossumd8225182000-03-10 22:33:05 +00001778/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001779 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001780
1781 These APIs are implemented in Objects/unicodectype.c.
1782
1783*/
1784
Mark Hammond91a681d2002-08-12 07:21:58 +00001785PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001786 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001787 );
1788
Mark Hammond91a681d2002-08-12 07:21:58 +00001789PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001790 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001791 );
1792
Mark Hammond91a681d2002-08-12 07:21:58 +00001793PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001794 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001795 );
1796
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001797PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001798 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001799 );
1800
1801PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001802 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001803 );
1804
Mark Hammond91a681d2002-08-12 07:21:58 +00001805PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001806 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001807 );
1808
Mark Hammond91a681d2002-08-12 07:21:58 +00001809PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001810 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001811 );
1812
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001813PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1814 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001815 );
1816
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001817PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1818 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001819 );
1820
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001821PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1822 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001823 );
1824
Mark Hammond91a681d2002-08-12 07:21:58 +00001825PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001826 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001827 );
1828
Mark Hammond91a681d2002-08-12 07:21:58 +00001829PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001830 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001831 );
1832
Mark Hammond91a681d2002-08-12 07:21:58 +00001833PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001834 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001835 );
1836
Mark Hammond91a681d2002-08-12 07:21:58 +00001837PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001838 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001839 );
1840
Mark Hammond91a681d2002-08-12 07:21:58 +00001841PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001842 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001843 );
1844
Mark Hammond91a681d2002-08-12 07:21:58 +00001845PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001846 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001847 );
1848
Georg Brandl559e5d72008-06-11 18:37:52 +00001849PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001850 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001851 );
1852
Mark Hammond91a681d2002-08-12 07:21:58 +00001853PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001854 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001855 );
1856
Victor Stinneref8d95c2010-08-16 22:03:11 +00001857PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1858 const Py_UNICODE *u
1859 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001860
1861PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001862 Py_UNICODE *s1,
1863 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001864
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001865PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1866 Py_UNICODE *s1, const Py_UNICODE *s2);
1867
Martin v. Löwis5b222132007-06-10 09:51:05 +00001868PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001869 Py_UNICODE *s1,
1870 const Py_UNICODE *s2,
1871 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001872
1873PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001874 const Py_UNICODE *s1,
1875 const Py_UNICODE *s2
1876 );
1877
/* Takes two read-only Py_UNICODE buffers plus a count n and returns int.
   NOTE(review): presumably mirrors C strncmp() (compare at most n
   elements) -- confirm the exact return values in the implementation. */
1878PyAPI_FUNC(int) Py_UNICODE_strncmp(
1879 const Py_UNICODE *s1,
1880 const Py_UNICODE *s2,
1881 size_t n
1882 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001883
/* Takes a read-only buffer s and a Py_UNICODE value c; returns Py_UNICODE*.
   Note that the return type is non-const although s is const-qualified.
   NOTE(review): presumably mirrors C strchr() (pointer to the first c in s,
   NULL when absent) -- confirm against the implementation. */
1884PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001885 const Py_UNICODE *s,
1886 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001887 );
1888
/* Takes a read-only buffer s and a Py_UNICODE value c; returns Py_UNICODE*.
   Note that the return type is non-const although s is const-qualified.
   NOTE(review): presumably mirrors C strrchr() (pointer to the last c in s,
   NULL when absent) -- confirm against the implementation. */
Victor Stinner331ea922010-08-10 16:37:20 +00001889PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001890 const Py_UNICODE *s,
1891 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001892 );
1893
/* Takes a read-only Py_UCS4 buffer u and returns a size_t.
   NOTE(review): by its name this is the Py_UCS4 counterpart of C strlen(),
   i.e. the count of elements before a terminating 0 -- confirm. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894PyAPI_FUNC(size_t) Py_UCS4_strlen(
1895 const Py_UCS4 *u
1896 );
1897
/* Takes a writable buffer s1 and a read-only buffer s2; returns Py_UCS4*.
   NOTE(review): presumably mirrors C strcpy() (copy s2 into s1, return s1).
   No size parameter is taken, so the caller would have to guarantee that
   s1 is large enough -- confirm against the implementation. */
1898PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1899 Py_UCS4 *s1,
1900 const Py_UCS4 *s2);
1901
/* Takes a writable buffer s1 and a read-only buffer s2; returns Py_UCS4*.
   NOTE(review): presumably mirrors C strcat() (append s2 to s1, return s1).
   No size parameter is taken, so the caller would have to guarantee that
   s1 has room for the result -- confirm against the implementation. */
1902PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1903 Py_UCS4 *s1, const Py_UCS4 *s2);
1904
/* Takes a writable buffer s1, a read-only buffer s2 and a count n; returns
   Py_UCS4*.
   NOTE(review): presumably mirrors C strncpy() (copy at most n elements).
   If it follows strncpy() semantics, s1 may be left without a terminator
   when s2 has n or more elements -- verify before relying on termination. */
1905PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1906 Py_UCS4 *s1,
1907 const Py_UCS4 *s2,
1908 size_t n);
1909
/* Takes two read-only Py_UCS4 buffers and returns int.
   NOTE(review): presumably mirrors C strcmp() (negative / zero / positive
   ordering result) -- confirm the exact return values in the implementation. */
1910PyAPI_FUNC(int) Py_UCS4_strcmp(
1911 const Py_UCS4 *s1,
1912 const Py_UCS4 *s2
1913 );
1914
/* Takes two read-only Py_UCS4 buffers plus a count n and returns int.
   NOTE(review): presumably mirrors C strncmp() (compare at most n
   elements) -- confirm the exact return values in the implementation. */
1915PyAPI_FUNC(int) Py_UCS4_strncmp(
1916 const Py_UCS4 *s1,
1917 const Py_UCS4 *s2,
1918 size_t n
1919 );
1920
/* Takes a read-only buffer s and a Py_UCS4 value c; returns Py_UCS4*.
   Note that the return type is non-const although s is const-qualified.
   NOTE(review): presumably mirrors C strchr() (pointer to the first c in s,
   NULL when absent) -- confirm against the implementation. */
1921PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1922 const Py_UCS4 *s,
1923 Py_UCS4 c
1924 );
1925
/* Takes a read-only buffer s and a Py_UCS4 value c; returns Py_UCS4*.
   Note that the return type is non-const although s is const-qualified.
   NOTE(review): presumably mirrors C strrchr() (pointer to the last c in s,
   NULL when absent) -- confirm against the implementation. */
1926PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1927 const Py_UCS4 *s,
1928 Py_UCS4 c
1929 );
1930
Victor Stinner71133ff2010-09-01 23:43:53 +00001931/* Create a copy of a unicode string ending with a nul character. Return NULL
1932 and raise a MemoryError exception on memory allocation failure, otherwise
1933 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1934
Victor Stinner46408602010-09-03 16:18:00 +00001935PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001936 PyObject *unicode
1937 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001938#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001939
Guido van Rossumd8225182000-03-10 22:33:05 +00001940#ifdef __cplusplus
1941}
1942#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001943#endif /* !Py_UNICODEOBJECT_H */