blob: 4d2a8e4d63f6bfdd2347cae383fe71788b59b5f8 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Alexander Belopolsky83283c22010-11-16 14:29:01 +000010Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
Guido van Rossumd8225182000-03-10 22:33:05 +000012
Guido van Rossum16b1ad92000-08-03 16:24:25 +000013Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000014
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000032 *
Guido van Rossumd8225182000-03-10 22:33:05 +000033 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 *
Guido van Rossumd8225182000-03-10 22:33:05 +000036 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 *
Guido van Rossumd8225182000-03-10 22:33:05 +000040 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000048 *
Guido van Rossumd8225182000-03-10 22:33:05 +000049 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000058#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000059
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
Christian Heimes0625e892008-01-07 21:04:21 +000064/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000066
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000067/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
68 properly set, but the default rules below doesn't set it. I'll
69 sort this out some other day -- fredrik@pythonware.com */
70
71#ifndef Py_UNICODE_SIZE
72#error Must define Py_UNICODE_SIZE
73#endif
74
Fredrik Lundh8f455852001-06-27 18:59:43 +000075/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
76 strings are stored as UCS-2 (with limited support for UTF-16) */
77
78#if Py_UNICODE_SIZE >= 4
79#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000080#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000081
Amaury Forgeot d'Arcfeb73072010-09-12 22:42:57 +000082/* Set these flags if the platform has "wchar.h" and the
Guido van Rossumd8225182000-03-10 22:33:05 +000083 wchar_t type is a 16-bit unsigned type */
84/* #define HAVE_WCHAR_H */
85/* #define HAVE_USABLE_WCHAR_T */
86
/* Platform defaults for PY_UNICODE_TYPE.  They only apply when the
   build configuration has not already picked a type. */
#if !defined(PY_UNICODE_TYPE)

/* Windows has a usable wchar_t type (unless we're using UCS-4). */
#  if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
#    define HAVE_USABLE_WCHAR_T
#    define PY_UNICODE_TYPE wchar_t
#  endif

/* Wide (UCS-4) builds use Py_UCS4 as the code unit. */
#  ifdef Py_UNICODE_WIDE
#    define PY_UNICODE_TYPE Py_UCS4
#  endif

#endif /* !PY_UNICODE_TYPE */
101
/* If the compiler provides a wchar_t type we try to support it through
   the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> can be included. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
#  define HAVE_WCHAR_H
#endif

#if defined(HAVE_WCHAR_H)
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
#  if defined(_HAVE_BSDI)
#    include <time.h>
#  endif
#  include <wchar.h>
#endif
119
/*
 * Py_UCS4: use this typedef when you need to represent a UTF-16
 * surrogate pair as a single unsigned integer.
 *
 * unsigned int is preferred; unsigned long is the fallback when
 * SIZEOF_INT is smaller than 4.  No typedef is emitted if neither
 * configured size reaches 4.
 */
#if (SIZEOF_INT >= 4)
typedef unsigned int Py_UCS4;
#elif (SIZEOF_LONG >= 4)
typedef unsigned long Py_UCS4;
#endif
129
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000130/* Py_UNICODE is the native Unicode storage format (code unit) used by
131 Python and represents a single Unicode element in the Unicode
132 type. */
133
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000134#ifndef Py_LIMITED_API
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000135typedef PY_UNICODE_TYPE Py_UNICODE;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000136#endif
Marc-André Lemburg43279102000-07-07 09:01:41 +0000137
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */

/* Every Unicode API name is redirected to a width-specific external
   name (PyUnicodeUCS2_* or PyUnicodeUCS4_*).  UCS-2 and UCS-4 builds
   therefore export different symbols, so combining an interpreter and
   an extension compiled with different Unicode width assumptions causes
   an import error.  Both tables below must list the same names. */

#ifndef Py_UNICODE_WIDE

#  define PyUnicode_AsASCIIString            PyUnicodeUCS2_AsASCIIString
#  define PyUnicode_AsCharmapString          PyUnicodeUCS2_AsCharmapString
#  define PyUnicode_AsDecodedObject          PyUnicodeUCS2_AsDecodedObject
#  define PyUnicode_AsDecodedUnicode         PyUnicodeUCS2_AsDecodedUnicode
#  define PyUnicode_AsEncodedObject          PyUnicodeUCS2_AsEncodedObject
#  define PyUnicode_AsEncodedString          PyUnicodeUCS2_AsEncodedString
#  define PyUnicode_AsEncodedUnicode         PyUnicodeUCS2_AsEncodedUnicode
#  define PyUnicode_AsLatin1String           PyUnicodeUCS2_AsLatin1String
#  define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
#  define PyUnicode_AsUTF32String            PyUnicodeUCS2_AsUTF32String
#  define PyUnicode_AsUTF16String            PyUnicodeUCS2_AsUTF16String
#  define PyUnicode_AsUTF8String             PyUnicodeUCS2_AsUTF8String
#  define PyUnicode_AsUnicode                PyUnicodeUCS2_AsUnicode
#  define PyUnicode_AsUnicodeEscapeString    PyUnicodeUCS2_AsUnicodeEscapeString
#  define PyUnicode_AsWideChar               PyUnicodeUCS2_AsWideChar
#  define PyUnicode_AsWideCharString         PyUnicodeUCS2_AsWideCharString
#  define PyUnicode_ClearFreeList            PyUnicodeUCS2_ClearFreelist
#  define PyUnicode_Compare                  PyUnicodeUCS2_Compare
#  define PyUnicode_CompareWithASCIIString   PyUnicodeUCS2_CompareWithASCIIString
#  define PyUnicode_Concat                   PyUnicodeUCS2_Concat
#  define PyUnicode_Append                   PyUnicodeUCS2_Append
#  define PyUnicode_AppendAndDel             PyUnicodeUCS2_AppendAndDel
#  define PyUnicode_Contains                 PyUnicodeUCS2_Contains
#  define PyUnicode_Count                    PyUnicodeUCS2_Count
#  define PyUnicode_Decode                   PyUnicodeUCS2_Decode
#  define PyUnicode_DecodeASCII              PyUnicodeUCS2_DecodeASCII
#  define PyUnicode_DecodeCharmap            PyUnicodeUCS2_DecodeCharmap
#  define PyUnicode_DecodeLatin1             PyUnicodeUCS2_DecodeLatin1
#  define PyUnicode_DecodeFSDefault          PyUnicodeUCS2_DecodeFSDefault
#  define PyUnicode_DecodeFSDefaultAndSize   PyUnicodeUCS2_DecodeFSDefaultAndSize
#  define PyUnicode_DecodeRawUnicodeEscape   PyUnicodeUCS2_DecodeRawUnicodeEscape
#  define PyUnicode_DecodeUTF32              PyUnicodeUCS2_DecodeUTF32
#  define PyUnicode_DecodeUTF32Stateful      PyUnicodeUCS2_DecodeUTF32Stateful
#  define PyUnicode_DecodeUTF16              PyUnicodeUCS2_DecodeUTF16
#  define PyUnicode_DecodeUTF16Stateful      PyUnicodeUCS2_DecodeUTF16Stateful
#  define PyUnicode_DecodeUTF8               PyUnicodeUCS2_DecodeUTF8
#  define PyUnicode_DecodeUTF8Stateful       PyUnicodeUCS2_DecodeUTF8Stateful
#  define PyUnicode_DecodeUnicodeEscape      PyUnicodeUCS2_DecodeUnicodeEscape
#  define PyUnicode_Encode                   PyUnicodeUCS2_Encode
#  define PyUnicode_EncodeASCII              PyUnicodeUCS2_EncodeASCII
#  define PyUnicode_EncodeCharmap            PyUnicodeUCS2_EncodeCharmap
#  define PyUnicode_EncodeDecimal            PyUnicodeUCS2_EncodeDecimal
#  define PyUnicode_EncodeLatin1             PyUnicodeUCS2_EncodeLatin1
#  define PyUnicode_EncodeRawUnicodeEscape   PyUnicodeUCS2_EncodeRawUnicodeEscape
#  define PyUnicode_EncodeUTF32              PyUnicodeUCS2_EncodeUTF32
#  define PyUnicode_EncodeUTF16              PyUnicodeUCS2_EncodeUTF16
#  define PyUnicode_EncodeUTF8               PyUnicodeUCS2_EncodeUTF8
#  define PyUnicode_EncodeUnicodeEscape      PyUnicodeUCS2_EncodeUnicodeEscape
#  define PyUnicode_Find                     PyUnicodeUCS2_Find
#  define PyUnicode_Format                   PyUnicodeUCS2_Format
#  define PyUnicode_FromEncodedObject        PyUnicodeUCS2_FromEncodedObject
#  define PyUnicode_FromFormat               PyUnicodeUCS2_FromFormat
#  define PyUnicode_FromFormatV              PyUnicodeUCS2_FromFormatV
#  define PyUnicode_FromObject               PyUnicodeUCS2_FromObject
#  define PyUnicode_FromOrdinal              PyUnicodeUCS2_FromOrdinal
#  define PyUnicode_FromString               PyUnicodeUCS2_FromString
#  define PyUnicode_FromStringAndSize        PyUnicodeUCS2_FromStringAndSize
#  define PyUnicode_FromUnicode              PyUnicodeUCS2_FromUnicode
#  define PyUnicode_FromWideChar             PyUnicodeUCS2_FromWideChar
#  define PyUnicode_FSConverter              PyUnicodeUCS2_FSConverter
#  define PyUnicode_FSDecoder                PyUnicodeUCS2_FSDecoder
#  define PyUnicode_GetDefaultEncoding       PyUnicodeUCS2_GetDefaultEncoding
#  define PyUnicode_GetMax                   PyUnicodeUCS2_GetMax
#  define PyUnicode_GetSize                  PyUnicodeUCS2_GetSize
#  define PyUnicode_IsIdentifier             PyUnicodeUCS2_IsIdentifier
#  define PyUnicode_Join                     PyUnicodeUCS2_Join
#  define PyUnicode_Partition                PyUnicodeUCS2_Partition
#  define PyUnicode_RPartition               PyUnicodeUCS2_RPartition
#  define PyUnicode_RSplit                   PyUnicodeUCS2_RSplit
#  define PyUnicode_Replace                  PyUnicodeUCS2_Replace
#  define PyUnicode_Resize                   PyUnicodeUCS2_Resize
#  define PyUnicode_RichCompare              PyUnicodeUCS2_RichCompare
#  define PyUnicode_Split                    PyUnicodeUCS2_Split
#  define PyUnicode_Splitlines               PyUnicodeUCS2_Splitlines
#  define PyUnicode_Tailmatch                PyUnicodeUCS2_Tailmatch
#  define PyUnicode_Translate                PyUnicodeUCS2_Translate
#  define PyUnicode_TranslateCharmap         PyUnicodeUCS2_TranslateCharmap
#  define _PyUnicode_AsDefaultEncodedString  _PyUnicodeUCS2_AsDefaultEncodedString
#  define _PyUnicode_Fini                    _PyUnicodeUCS2_Fini
#  define _PyUnicode_Init                    _PyUnicodeUCS2_Init
#  define PyUnicode_strdup                   PyUnicodeUCS2_strdup

#else /* Py_UNICODE_WIDE */

#  define PyUnicode_AsASCIIString            PyUnicodeUCS4_AsASCIIString
#  define PyUnicode_AsCharmapString          PyUnicodeUCS4_AsCharmapString
#  define PyUnicode_AsDecodedObject          PyUnicodeUCS4_AsDecodedObject
#  define PyUnicode_AsDecodedUnicode         PyUnicodeUCS4_AsDecodedUnicode
#  define PyUnicode_AsEncodedObject          PyUnicodeUCS4_AsEncodedObject
#  define PyUnicode_AsEncodedString          PyUnicodeUCS4_AsEncodedString
#  define PyUnicode_AsEncodedUnicode         PyUnicodeUCS4_AsEncodedUnicode
#  define PyUnicode_AsLatin1String           PyUnicodeUCS4_AsLatin1String
#  define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
#  define PyUnicode_AsUTF32String            PyUnicodeUCS4_AsUTF32String
#  define PyUnicode_AsUTF16String            PyUnicodeUCS4_AsUTF16String
#  define PyUnicode_AsUTF8String             PyUnicodeUCS4_AsUTF8String
#  define PyUnicode_AsUnicode                PyUnicodeUCS4_AsUnicode
#  define PyUnicode_AsUnicodeEscapeString    PyUnicodeUCS4_AsUnicodeEscapeString
#  define PyUnicode_AsWideChar               PyUnicodeUCS4_AsWideChar
#  define PyUnicode_AsWideCharString         PyUnicodeUCS4_AsWideCharString
#  define PyUnicode_ClearFreeList            PyUnicodeUCS4_ClearFreelist
#  define PyUnicode_Compare                  PyUnicodeUCS4_Compare
#  define PyUnicode_CompareWithASCIIString   PyUnicodeUCS4_CompareWithASCIIString
#  define PyUnicode_Concat                   PyUnicodeUCS4_Concat
#  define PyUnicode_Append                   PyUnicodeUCS4_Append
#  define PyUnicode_AppendAndDel             PyUnicodeUCS4_AppendAndDel
#  define PyUnicode_Contains                 PyUnicodeUCS4_Contains
#  define PyUnicode_Count                    PyUnicodeUCS4_Count
#  define PyUnicode_Decode                   PyUnicodeUCS4_Decode
#  define PyUnicode_DecodeASCII              PyUnicodeUCS4_DecodeASCII
#  define PyUnicode_DecodeCharmap            PyUnicodeUCS4_DecodeCharmap
#  define PyUnicode_DecodeLatin1             PyUnicodeUCS4_DecodeLatin1
#  define PyUnicode_DecodeFSDefault          PyUnicodeUCS4_DecodeFSDefault
#  define PyUnicode_DecodeFSDefaultAndSize   PyUnicodeUCS4_DecodeFSDefaultAndSize
#  define PyUnicode_DecodeRawUnicodeEscape   PyUnicodeUCS4_DecodeRawUnicodeEscape
#  define PyUnicode_DecodeUTF32              PyUnicodeUCS4_DecodeUTF32
#  define PyUnicode_DecodeUTF32Stateful      PyUnicodeUCS4_DecodeUTF32Stateful
#  define PyUnicode_DecodeUTF16              PyUnicodeUCS4_DecodeUTF16
#  define PyUnicode_DecodeUTF16Stateful      PyUnicodeUCS4_DecodeUTF16Stateful
#  define PyUnicode_DecodeUTF8               PyUnicodeUCS4_DecodeUTF8
#  define PyUnicode_DecodeUTF8Stateful       PyUnicodeUCS4_DecodeUTF8Stateful
#  define PyUnicode_DecodeUnicodeEscape      PyUnicodeUCS4_DecodeUnicodeEscape
#  define PyUnicode_Encode                   PyUnicodeUCS4_Encode
#  define PyUnicode_EncodeASCII              PyUnicodeUCS4_EncodeASCII
#  define PyUnicode_EncodeCharmap            PyUnicodeUCS4_EncodeCharmap
#  define PyUnicode_EncodeDecimal            PyUnicodeUCS4_EncodeDecimal
#  define PyUnicode_EncodeLatin1             PyUnicodeUCS4_EncodeLatin1
#  define PyUnicode_EncodeRawUnicodeEscape   PyUnicodeUCS4_EncodeRawUnicodeEscape
#  define PyUnicode_EncodeUTF32              PyUnicodeUCS4_EncodeUTF32
#  define PyUnicode_EncodeUTF16              PyUnicodeUCS4_EncodeUTF16
#  define PyUnicode_EncodeUTF8               PyUnicodeUCS4_EncodeUTF8
#  define PyUnicode_EncodeUnicodeEscape      PyUnicodeUCS4_EncodeUnicodeEscape
#  define PyUnicode_Find                     PyUnicodeUCS4_Find
#  define PyUnicode_Format                   PyUnicodeUCS4_Format
#  define PyUnicode_FromEncodedObject        PyUnicodeUCS4_FromEncodedObject
#  define PyUnicode_FromFormat               PyUnicodeUCS4_FromFormat
#  define PyUnicode_FromFormatV              PyUnicodeUCS4_FromFormatV
#  define PyUnicode_FromObject               PyUnicodeUCS4_FromObject
#  define PyUnicode_FromOrdinal              PyUnicodeUCS4_FromOrdinal
#  define PyUnicode_FromString               PyUnicodeUCS4_FromString
#  define PyUnicode_FromStringAndSize        PyUnicodeUCS4_FromStringAndSize
#  define PyUnicode_FromUnicode              PyUnicodeUCS4_FromUnicode
#  define PyUnicode_FromWideChar             PyUnicodeUCS4_FromWideChar
#  define PyUnicode_FSConverter              PyUnicodeUCS4_FSConverter
#  define PyUnicode_FSDecoder                PyUnicodeUCS4_FSDecoder
#  define PyUnicode_GetDefaultEncoding       PyUnicodeUCS4_GetDefaultEncoding
#  define PyUnicode_GetMax                   PyUnicodeUCS4_GetMax
#  define PyUnicode_GetSize                  PyUnicodeUCS4_GetSize
#  define PyUnicode_IsIdentifier             PyUnicodeUCS4_IsIdentifier
#  define PyUnicode_Join                     PyUnicodeUCS4_Join
#  define PyUnicode_Partition                PyUnicodeUCS4_Partition
#  define PyUnicode_RPartition               PyUnicodeUCS4_RPartition
#  define PyUnicode_RSplit                   PyUnicodeUCS4_RSplit
#  define PyUnicode_Replace                  PyUnicodeUCS4_Replace
#  define PyUnicode_Resize                   PyUnicodeUCS4_Resize
#  define PyUnicode_RichCompare              PyUnicodeUCS4_RichCompare
#  define PyUnicode_Split                    PyUnicodeUCS4_Split
#  define PyUnicode_Splitlines               PyUnicodeUCS4_Splitlines
#  define PyUnicode_Tailmatch                PyUnicodeUCS4_Tailmatch
#  define PyUnicode_Translate                PyUnicodeUCS4_Translate
#  define PyUnicode_TranslateCharmap         PyUnicodeUCS4_TranslateCharmap
#  define _PyUnicode_AsDefaultEncodedString  _PyUnicodeUCS4_AsDefaultEncodedString
#  define _PyUnicode_Fini                    _PyUnicodeUCS4_Fini
#  define _PyUnicode_Init                    _PyUnicodeUCS4_Init
#  define PyUnicode_strdup                   PyUnicodeUCS4_strdup

#endif /* Py_UNICODE_WIDE */
314
Guido van Rossumd8225182000-03-10 22:33:05 +0000315/* --- Internal Unicode Operations ---------------------------------------- */
316
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000317/* Since splitting on whitespace is an important use case, and
318 whitespace in most situations is solely ASCII whitespace, we
319 optimize for the common case by using a quick look-up table
320 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000321
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000322 */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000323#ifndef Py_LIMITED_API
/* Values below 128 are answered from the _Py_ascii_whitespace table;
   anything else is passed on to _PyUnicode_IsWhitespace(). */
#define Py_UNICODE_ISSPACE(ch)                  \
    ((ch) < 128U                                \
         ? _Py_ascii_whitespace[(ch)]           \
         : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000326
/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000345#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000346
/* Alphanumeric test: true when any of the alpha, decimal, digit or
   numeric predicates accepts `ch`.  The predicates are tried in that
   order and evaluation stops at the first hit, so `ch` may be
   evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)          \
    (   Py_UNICODE_ISALPHA(ch)          \
     || Py_UNICODE_ISDECIMAL(ch)        \
     || Py_UNICODE_ISDIGIT(ch)          \
     || Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000352
/* Copy `length` code units from `source` to `target` via Py_MEMCPY.
   `length` counts Py_UNICODE elements, not bytes. */
#define Py_UNICODE_COPY(target, source, length)     \
    Py_MEMCPY((target),                             \
              (source),                             \
              (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000355
/* Store `value` into the first `length` code units of `target`.

   Each argument is evaluated exactly once (target, then value, then
   length), so expressions with side effects can be passed safely.
   Previously `length` was re-evaluated on every loop iteration.
   Nothing is written when `length` is zero or negative. */
#define Py_UNICODE_FILL(target, value, length)                          \
    do {                                                                \
        Py_ssize_t i_;                                                  \
        Py_UNICODE *t_ = (target);                                      \
        Py_UNICODE v_ = (value);                                        \
        /* capture once: the loop bound must not re-run the argument */ \
        Py_ssize_t len_ = (length);                                     \
        for (i_ = 0; i_ < len_; i_++)                                   \
            t_[i_] = v_;                                                \
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000360
/* Test whether `substring` occurs in `string` at `offset`.  The offset
   must be valid, and the substring must not be empty.

   The first and the last code unit are compared before memcmp() is run
   over the whole substring.  All three arguments are evaluated more
   than once. */
#define Py_UNICODE_MATCH(string, offset, substring)                         \
    (((string)->str[(offset)] == (substring)->str[0]) &&                    \
     ((string)->str[(offset) + (substring)->length - 1] ==                  \
          (substring)->str[(substring)->length - 1]) &&                     \
     !memcmp((string)->str + (offset),                                      \
             (substring)->str,                                              \
             (substring)->length*sizeof(Py_UNICODE)))
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000368#endif /* Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000369
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370#ifdef __cplusplus
371extern "C" {
372#endif
373
Guido van Rossumd8225182000-03-10 22:33:05 +0000374/* --- Unicode Type ------------------------------------------------------- */
375
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000376#ifndef Py_LIMITED_API
Guido van Rossumd8225182000-03-10 22:33:05 +0000377typedef struct {
378 PyObject_HEAD
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 Py_ssize_t length; /* Length of raw Unicode data in buffer */
380 Py_UNICODE *str; /* Raw Unicode buffer */
Benjamin Peterson8f67d082010-10-17 20:54:53 +0000381 Py_hash_t hash; /* Hash value; -1 if not set */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000382 int state; /* != 0 if interned. In this case the two
383 * references from the dictionary to this object
384 * are *not* counted in ob_refcnt. */
385 PyObject *defenc; /* (Default) Encoded version as Python
386 string, or NULL; this is used for
387 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000388} PyUnicodeObject;
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000389#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000390
Mark Hammond91a681d2002-08-12 07:21:58 +0000391PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000392PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000393
/* Interning states.  NOTE(review): presumably the values stored in
   PyUnicodeObject.state -- confirm against the implementation. */
#define SSTATE_NOT_INTERNED         0
#define SSTATE_INTERNED_MORTAL      1
#define SSTATE_INTERNED_IMMORTAL    2
397
/* PyUnicode_Check() tests the Py_TPFLAGS_UNICODE_SUBCLASS flag of the
   object's type; PyUnicode_CheckExact() compares the object's type
   against PyUnicode_Type itself. */
#define PyUnicode_Check(op)                                         \
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op)                                    \
    (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000401
/* Fast access macros.  They read PyUnicodeObject fields directly; the
   only validation is an assert() on PyUnicode_Check(op).  `op` is
   evaluated more than once.  Hidden from the limited API. */
#if !defined(Py_LIMITED_API)

/* Length field of the object. */
#define PyUnicode_GET_SIZE(op)                                          \
    (assert(PyUnicode_Check(op)),                                       \
     (((PyUnicodeObject *)(op))->length))

/* Length field multiplied by sizeof(Py_UNICODE). */
#define PyUnicode_GET_DATA_SIZE(op)                                     \
    (assert(PyUnicode_Check(op)),                                       \
     (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))

/* The str buffer pointer. */
#define PyUnicode_AS_UNICODE(op)                                        \
    (assert(PyUnicode_Check(op)),                                       \
     (((PyUnicodeObject *)(op))->str))

/* The str buffer pointer, cast to const char *. */
#define PyUnicode_AS_DATA(op)                                           \
    (assert(PyUnicode_Check(op)),                                       \
     ((const char *)((PyUnicodeObject *)(op))->str))

#endif /* !Py_LIMITED_API */
Guido van Rossumd8225182000-03-10 22:33:05 +0000413
414/* --- Constants ---------------------------------------------------------- */
415
/* Replacement character used during decoding when the errors argument
   is set to "replace".  Note: U+FFFD is the official REPLACEMENT
   CHARACTER in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER    ((Py_UNICODE) 0xFFFD)
422
423/* === Public API ========================================================= */
424
425/* --- Plain Py_UNICODE --------------------------------------------------- */
426
427/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000428 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
430 u may be NULL which causes the contents to be undefined. It is the
431 user's responsibility to fill in the needed data afterwards. Note
432 that modifying the Unicode object contents after construction is
433 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000434
435 The buffer is copied into the new object. */
436
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000437#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000438PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000439 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000440 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000441 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000442#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000443
Georg Brandl952867a2010-06-27 10:17:12 +0000444/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000445PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
Victor Stinner0d711162010-12-27 02:39:20 +0000446 const char *u, /* UTF-8 encoded string */
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000447 Py_ssize_t size /* size of buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000448 );
449
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000450/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Georg Brandl952867a2010-06-27 10:17:12 +0000451 UTF-8 encoded bytes */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452PyAPI_FUNC(PyObject*) PyUnicode_FromString(
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000453 const char *u /* UTF-8 encoded string */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 );
455
Guido van Rossumd8225182000-03-10 22:33:05 +0000456/* Return a read-only pointer to the Unicode object's internal
457 Py_UNICODE buffer. */
458
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000459#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000460PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000461 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000462 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000463#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000464
465/* Get the length of the Unicode object. */
466
Martin v. Löwis18e16552006-02-15 17:27:45 +0000467PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000469 );
470
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000471#ifndef Py_LIMITED_API
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000472/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000473PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000474#endif
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475
Guido van Rossum52c23592000-04-10 13:41:41 +0000476/* Resize an already allocated Unicode object to the new size length.
477
478 *unicode is modified to point to the new (resized) object and 0
479 returned on success.
480
481 This API may only be called by the function which also called the
482 Unicode constructor. The refcount on the object must be 1. Otherwise,
483 an error is returned.
484
485 Error handling is implemented as follows: an exception is set, -1
486 is returned and *unicode left untouched.
487
488*/
489
Mark Hammond91a681d2002-08-12 07:21:58 +0000490PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000491 PyObject **unicode, /* Pointer to the Unicode object */
492 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000493 );
494
Guido van Rossumd8225182000-03-10 22:33:05 +0000495/* Coerce obj to an Unicode object and return a reference with
496 *incremented* refcount.
497
498 Coercion is done in the following way:
499
Georg Brandl952867a2010-06-27 10:17:12 +0000500 1. bytes, bytearray and other char buffer compatible objects are decoded
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000501 under the assumptions that they contain data using the UTF-8
502 encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000503
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000504 2. All other objects (including Unicode objects) raise an
505 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000506
507 The API returns NULL in case of an error. The caller is responsible
508 for decref'ing the returned objects.
509
510*/
511
Mark Hammond91a681d2002-08-12 07:21:58 +0000512PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000513 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 const char *encoding, /* encoding */
515 const char *errors /* error handling */
516 );
517
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000518/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000520
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000521 Unicode objects are passed back as-is (subclasses are converted to
522 true Unicode objects), all other objects are delegated to
523 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000524 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525
526 The API returns NULL in case of an error. The caller is responsible
527 for decref'ing the returned objects.
528
529*/
530
Mark Hammond91a681d2002-08-12 07:21:58 +0000531PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000533 );
534
Victor Stinner1205f272010-09-11 00:54:47 +0000535PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
536 const char *format, /* ASCII-encoded string */
537 va_list vargs
538 );
539PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
540 const char *format, /* ASCII-encoded string */
541 ...
542 );
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000544#ifndef Py_LIMITED_API
Eric Smith4a7d76d2008-05-30 18:10:19 +0000545/* Format the object based on the format_spec, as defined in PEP 3101
546 (Advanced String Formatting). */
547PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
548 Py_UNICODE *format_spec,
549 Py_ssize_t format_spec_len);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000550#endif
Eric Smith4a7d76d2008-05-30 18:10:19 +0000551
Walter Dörwald16807132007-05-25 13:52:07 +0000552PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
553PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
Victor Stinnerdc2081f2010-12-27 01:49:29 +0000554PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
555 const char *u /* UTF-8 encoded string */
556 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000557#ifndef Py_LIMITED_API
Walter Dörwald16807132007-05-25 13:52:07 +0000558PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000559#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000560
561/* Use only if you know it's a string */
562#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
563
Guido van Rossumd8225182000-03-10 22:33:05 +0000564/* --- wchar_t support for platforms which support it --------------------- */
565
566#ifdef HAVE_WCHAR_H
567
Georg Brandl952867a2010-06-27 10:17:12 +0000568/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000569 size.
570
571 The buffer is copied into the new object. */
572
Mark Hammond91a681d2002-08-12 07:21:58 +0000573PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000574 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000575 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000576 );
577
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000578/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000579 most size wchar_t characters are copied.
580
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000581 Note that the resulting wchar_t string may or may not be
582 0-terminated. It is the responsibility of the caller to make sure
583 that the wchar_t string is 0-terminated in case this is required by
584 the application.
585
586 Returns the number of wchar_t characters copied (excluding a
587 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000588 error. */
589
Martin v. Löwis18e16552006-02-15 17:27:45 +0000590PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000591 PyObject *unicode, /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000592 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000593 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000594 );
595
Victor Stinner137c34c2010-09-29 10:25:54 +0000596/* Convert the Unicode object to a wide character string. The output string
597 always ends with a nul character. If size is not NULL, write the number of
598 wide characters (including the nul character) into *size.
599
   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
601 on success. On error, returns NULL, *size is undefined and raises a
602 MemoryError. */
603
604PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Victor Stinnerbeb4135b2010-10-07 01:02:42 +0000605 PyObject *unicode, /* Unicode object */
Victor Stinner137c34c2010-09-29 10:25:54 +0000606 Py_ssize_t *size /* number of characters of the result */
607 );
608
Guido van Rossumd8225182000-03-10 22:33:05 +0000609#endif
610
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000611/* --- Unicode ordinals --------------------------------------------------- */
612
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000613/* Create a Unicode Object from the given Unicode code point ordinal.
614
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000615 The ordinal must be in range(0x10000) on narrow Python builds
616 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
617 raised in case it is not.
618
619*/
620
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000621PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000622
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000623/* --- Free-list management ----------------------------------------------- */
624
625/* Clear the free list used by the Unicode implementation.
626
627 This can be used to release memory used for objects on the free
628 list back to the Python memory allocator.
629
630*/
631
632PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
633
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000634/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000635
636 Many of these APIs take two arguments encoding and errors. These
637 parameters encoding and errors have the same semantics as the ones
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000638 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000639
Georg Brandl952867a2010-06-27 10:17:12 +0000640 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000641
642 Error handling is set by errors which may also be set to NULL
643 meaning to use the default handling defined for the codec. Default
644 error handling for all builtin codecs is "strict" (ValueErrors are
645 raised).
646
   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.
649
650*/
651
Fred Drakecb093fe2000-05-09 19:51:53 +0000652/* --- Manage the default encoding ---------------------------------------- */
653
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000654/* Return a Python string holding the default encoded value of the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 Unicode object.
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000656
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000657 Same as PyUnicode_AsUTF8String() except
658 the resulting string is cached in the Unicode object for subsequent
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000659 usage by this function. The cached version is needed to implement
660 the character buffer interface and will live (at least) as long as
661 the Unicode object itself.
662
663 The refcount of the string is *not* incremented.
664
665 *** Exported for internal use by the interpreter only !!! ***
666
667*/
668
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000669#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000670PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000671 PyObject *unicode,
672 const char *errors);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000673#endif
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000674
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000675/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000676 Unicode object unicode and the size of the encoded representation
677 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000678
   In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000680
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000681 *** This API is for interpreter INTERNAL USE ONLY and will likely
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000682 *** be removed or changed in the future.
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000683
684 *** If you need to access the Unicode object as UTF-8 bytes string,
685 *** please use PyUnicode_AsUTF8String() instead.
686
Martin v. Löwis5b222132007-06-10 09:51:05 +0000687*/
688
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000689#ifndef Py_LIMITED_API
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000690PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000692 Py_ssize_t *size);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000693#endif
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000694
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000695/* Returns a pointer to the default encoding (UTF-8) of the
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000696 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000697
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000698 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000699 extracted from the returned data.
700
   *** This API is for interpreter INTERNAL USE ONLY and will likely
   *** be removed or changed in the future.
703
704 *** If you need to access the Unicode object as UTF-8 bytes string,
705 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000706
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000707*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000708
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000709#ifndef Py_LIMITED_API
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000710PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000711#endif
Martin v. Löwis5b222132007-06-10 09:51:05 +0000712
Alexander Belopolsky83283c22010-11-16 14:29:01 +0000713/* Returns "utf-8". */
Fred Drakecb093fe2000-05-09 19:51:53 +0000714
Mark Hammond91a681d2002-08-12 07:21:58 +0000715PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000716
Guido van Rossumd8225182000-03-10 22:33:05 +0000717/* --- Generic Codecs ----------------------------------------------------- */
718
719/* Create a Unicode object by decoding the encoded string s of the
720 given size. */
721
Mark Hammond91a681d2002-08-12 07:21:58 +0000722PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000723 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000724 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000725 const char *encoding, /* encoding */
726 const char *errors /* error handling */
727 );
728
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000729/* Decode a Unicode object unicode and return the result as Python
730 object. */
731
732PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyObject *unicode, /* Unicode object */
734 const char *encoding, /* encoding */
735 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000736 );
737
738/* Decode a Unicode object unicode and return the result as Unicode
739 object. */
740
741PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 PyObject *unicode, /* Unicode object */
743 const char *encoding, /* encoding */
744 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000745 );
746
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000748 Python string object. */
749
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000750#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000751PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000752 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000753 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000754 const char *encoding, /* encoding */
755 const char *errors /* error handling */
756 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000757#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000758
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000759/* Encodes a Unicode object and returns the result as Python
760 object. */
761
762PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 PyObject *unicode, /* Unicode object */
764 const char *encoding, /* encoding */
765 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000766 );
767
Guido van Rossumd8225182000-03-10 22:33:05 +0000768/* Encodes a Unicode object and returns the result as Python string
769 object. */
770
Mark Hammond91a681d2002-08-12 07:21:58 +0000771PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 PyObject *unicode, /* Unicode object */
773 const char *encoding, /* encoding */
774 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000775 );
776
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000777/* Encodes a Unicode object and returns the result as Unicode
778 object. */
779
780PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 PyObject *unicode, /* Unicode object */
782 const char *encoding, /* encoding */
783 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000784 );
785
786/* Build an encoding map. */
787
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000788PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
789 PyObject* string /* 256 character map */
790 );
791
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000792/* --- UTF-7 Codecs ------------------------------------------------------- */
793
Mark Hammond91a681d2002-08-12 07:21:58 +0000794PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 const char *string, /* UTF-7 encoded string */
796 Py_ssize_t length, /* size of string */
797 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000798 );
799
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000800PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 const char *string, /* UTF-7 encoded string */
802 Py_ssize_t length, /* size of string */
803 const char *errors, /* error handling */
804 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000805 );
806
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000807#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000808PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 const Py_UNICODE *data, /* Unicode char buffer */
810 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
811 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
812 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
813 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000814 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000815#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000816
Guido van Rossumd8225182000-03-10 22:33:05 +0000817/* --- UTF-8 Codecs ------------------------------------------------------- */
818
Mark Hammond91a681d2002-08-12 07:21:58 +0000819PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 const char *string, /* UTF-8 encoded string */
821 Py_ssize_t length, /* size of string */
822 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000823 );
824
Walter Dörwald69652032004-09-07 20:24:22 +0000825PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 const char *string, /* UTF-8 encoded string */
827 Py_ssize_t length, /* size of string */
828 const char *errors, /* error handling */
829 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000830 );
831
Mark Hammond91a681d2002-08-12 07:21:58 +0000832PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000834 );
835
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000836#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000837PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 const Py_UNICODE *data, /* Unicode char buffer */
839 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
840 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000841 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000842#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000843
Walter Dörwald41980ca2007-08-16 21:55:45 +0000844/* --- UTF-32 Codecs ------------------------------------------------------ */
845
846/* Decodes length bytes from a UTF-32 encoded buffer string and returns
847 the corresponding Unicode object.
848
849 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000851
852 If byteorder is non-NULL, the decoder starts decoding using the
853 given byte order:
854
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000855 *byteorder == -1: little endian
856 *byteorder == 0: native order
857 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000858
859 In native mode, the first four bytes of the stream are checked for a
860 BOM mark. If found, the BOM mark is analysed, the byte order
861 adjusted and the BOM skipped. In the other modes, no BOM mark
862 interpretation is done. After completion, *byteorder is set to the
863 current byte order at the end of input data.
864
865 If byteorder is NULL, the codec starts in native order mode.
866
867*/
868
869PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 const char *string, /* UTF-32 encoded string */
871 Py_ssize_t length, /* size of string */
872 const char *errors, /* error handling */
873 int *byteorder /* pointer to byteorder to use
874 0=native;-1=LE,1=BE; updated on
875 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000876 );
877
878PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 const char *string, /* UTF-32 encoded string */
880 Py_ssize_t length, /* size of string */
881 const char *errors, /* error handling */
882 int *byteorder, /* pointer to byteorder to use
883 0=native;-1=LE,1=BE; updated on
884 exit */
885 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000886 );
887
888/* Returns a Python string using the UTF-32 encoding in native byte
889 order. The string always starts with a BOM mark. */
890
891PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000892 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000893 );
894
895/* Returns a Python string object holding the UTF-32 encoded value of
896 the Unicode data.
897
898 If byteorder is not 0, output is written according to the following
899 byte order:
900
901 byteorder == -1: little endian
902 byteorder == 0: native byte order (writes a BOM mark)
903 byteorder == 1: big endian
904
905 If byteorder is 0, the output string will always start with the
906 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
907 prepended.
908
909*/
910
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000911#ifndef Py_LIMITED_API
Walter Dörwald41980ca2007-08-16 21:55:45 +0000912PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000913 const Py_UNICODE *data, /* Unicode char buffer */
914 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
915 const char *errors, /* error handling */
916 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000917 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000918#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +0000919
Guido van Rossumd8225182000-03-10 22:33:05 +0000920/* --- UTF-16 Codecs ------------------------------------------------------ */
921
Guido van Rossum9e896b32000-04-05 20:11:21 +0000922/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000923 the corresponding Unicode object.
924
925 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000926 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000927
928 If byteorder is non-NULL, the decoder starts decoding using the
929 given byte order:
930
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931 *byteorder == -1: little endian
932 *byteorder == 0: native order
933 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000934
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000935 In native mode, the first two bytes of the stream are checked for a
936 BOM mark. If found, the BOM mark is analysed, the byte order
937 adjusted and the BOM skipped. In the other modes, no BOM mark
938 interpretation is done. After completion, *byteorder is set to the
939 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000940
941 If byteorder is NULL, the codec starts in native order mode.
942
943*/
944
Mark Hammond91a681d2002-08-12 07:21:58 +0000945PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000946 const char *string, /* UTF-16 encoded string */
947 Py_ssize_t length, /* size of string */
948 const char *errors, /* error handling */
949 int *byteorder /* pointer to byteorder to use
950 0=native;-1=LE,1=BE; updated on
951 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000952 );
953
Walter Dörwald69652032004-09-07 20:24:22 +0000954PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000955 const char *string, /* UTF-16 encoded string */
956 Py_ssize_t length, /* size of string */
957 const char *errors, /* error handling */
958 int *byteorder, /* pointer to byteorder to use
959 0=native;-1=LE,1=BE; updated on
960 exit */
961 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000962 );
963
Guido van Rossumd8225182000-03-10 22:33:05 +0000964/* Returns a Python string using the UTF-16 encoding in native byte
965 order. The string always starts with a BOM mark. */
966
Mark Hammond91a681d2002-08-12 07:21:58 +0000967PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000969 );
970
971/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000972 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000973
974 If byteorder is not 0, output is written according to the following
975 byte order:
976
977 byteorder == -1: little endian
978 byteorder == 0: native byte order (writes a BOM mark)
979 byteorder == 1: big endian
980
981 If byteorder is 0, the output string will always start with the
982 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
983 prepended.
984
985 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
986 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000987 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000988
989*/
990
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000991#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +0000992PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000993 const Py_UNICODE *data, /* Unicode char buffer */
994 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
995 const char *errors, /* error handling */
996 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +0000997 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +0000998#endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000999
1000/* --- Unicode-Escape Codecs ---------------------------------------------- */
1001
Mark Hammond91a681d2002-08-12 07:21:58 +00001002PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001003 const char *string, /* Unicode-Escape encoded string */
1004 Py_ssize_t length, /* size of string */
1005 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001006 );
1007
Mark Hammond91a681d2002-08-12 07:21:58 +00001008PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001010 );
1011
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001012#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001013PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 const Py_UNICODE *data, /* Unicode char buffer */
1015 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001016 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001017#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001018
1019/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1020
Mark Hammond91a681d2002-08-12 07:21:58 +00001021PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001022 const char *string, /* Raw-Unicode-Escape encoded string */
1023 Py_ssize_t length, /* size of string */
1024 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001025 );
1026
Mark Hammond91a681d2002-08-12 07:21:58 +00001027PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001028 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001029 );
1030
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001031#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001032PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001033 const Py_UNICODE *data, /* Unicode char buffer */
1034 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001035 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001036#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001037
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001038/* --- Unicode Internal Codec ---------------------------------------------
1039
1040 Only for internal use in _codecsmodule.c */
1041
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001042#ifndef Py_LIMITED_API
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001043PyObject *_PyUnicode_DecodeUnicodeInternal(
1044 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001045 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001046 const char *errors
1047 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001048#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001049
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001051
1052 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1053
1054*/
1055
Mark Hammond91a681d2002-08-12 07:21:58 +00001056PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 const char *string, /* Latin-1 encoded string */
1058 Py_ssize_t length, /* size of string */
1059 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001060 );
1061
Mark Hammond91a681d2002-08-12 07:21:58 +00001062PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001064 );
1065
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001066#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001067PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001068 const Py_UNICODE *data, /* Unicode char buffer */
1069 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1070 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001071 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001072#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001073
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001074/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001075
1076 Only 7-bit ASCII data is accepted. All other codes generate errors.
1077
1078*/
1079
Mark Hammond91a681d2002-08-12 07:21:58 +00001080PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001081 const char *string, /* ASCII encoded string */
1082 Py_ssize_t length, /* size of string */
1083 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001084 );
1085
Mark Hammond91a681d2002-08-12 07:21:58 +00001086PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001088 );
1089
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001090#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001091PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 const Py_UNICODE *data, /* Unicode char buffer */
1093 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1094 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001095 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001096#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001097
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001099
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001101
1102 Decoding mappings must map single string characters to single
1103 Unicode characters, integers (which are then interpreted as Unicode
1104 ordinals) or None (meaning "undefined mapping" and causing an
1105 error).
1106
1107 Encoding mappings must map single Unicode characters to single
1108 string characters, integers (which are then interpreted as Latin-1
1109 ordinals) or None (meaning "undefined mapping" and causing an
1110 error).
1111
1112 If a character lookup fails with a LookupError, the character is
1113 copied as-is meaning that its ordinal value will be interpreted as
1114 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1115 to contain those mappings which map characters to different code
1116 points.
1117
1118*/
1119
Mark Hammond91a681d2002-08-12 07:21:58 +00001120PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 const char *string, /* Encoded string */
1122 Py_ssize_t length, /* size of string */
1123 PyObject *mapping, /* character mapping
1124 (char ordinal -> unicode ordinal) */
1125 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001126 );
1127
Mark Hammond91a681d2002-08-12 07:21:58 +00001128PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001129 PyObject *unicode, /* Unicode object */
1130 PyObject *mapping /* character mapping
1131 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001132 );
1133
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001134#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001135PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 const Py_UNICODE *data, /* Unicode char buffer */
1137 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1138 PyObject *mapping, /* character mapping
1139 (unicode ordinal -> char ordinal) */
1140 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001141 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001142#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001143
1144/* Translate a Py_UNICODE buffer of the given length by applying a
1145 character mapping table to it and return the resulting Unicode
1146 object.
1147
1148 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001150
1151 Mapping tables may be dictionaries or sequences. Unmapped character
1152 ordinals (ones which cause a LookupError) are left untouched and
1153 are copied as-is.
1154
1155*/
1156
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001157#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001158PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001159 const Py_UNICODE *data, /* Unicode char buffer */
1160 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1161 PyObject *table, /* Translate table */
1162 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001163 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001164#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001165
Guido van Rossumefec1152000-03-28 02:01:15 +00001166#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +00001167
Guido van Rossumefec1152000-03-28 02:01:15 +00001168/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001169
Mark Hammond91a681d2002-08-12 07:21:58 +00001170PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001171 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001172 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001173 const char *errors /* error handling */
1174 );
1175
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001176PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1177 const char *string, /* MBCS encoded string */
1178 Py_ssize_t length, /* size of string */
1179 const char *errors, /* error handling */
1180 Py_ssize_t *consumed /* bytes consumed */
1181 );
1182
Mark Hammond91a681d2002-08-12 07:21:58 +00001183PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001184 PyObject *unicode /* Unicode object */
1185 );
1186
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001187#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001188PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001189 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001190 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001191 const char *errors /* error handling */
1192 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001193#endif
Guido van Rossumefec1152000-03-28 02:01:15 +00001194
Guido van Rossumefec1152000-03-28 02:01:15 +00001195#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001196
Guido van Rossum9e896b32000-04-05 20:11:21 +00001197/* --- Decimal Encoder ---------------------------------------------------- */
1198
1199/* Takes a Unicode string holding a decimal value and writes it into
1200 an output buffer using standard ASCII digit codes.
1201
1202 The output buffer has to provide at least length+1 bytes of storage
1203 area. The output string is 0-terminated.
1204
1205 The encoder converts whitespace to ' ', decimal characters to their
1206 corresponding ASCII digit and all other Latin-1 characters except
1207 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1208 are treated as errors. This includes embedded NUL characters.
1209
1210 Error handling is defined by the errors argument:
1211
1212 NULL or "strict": raise a ValueError
1213 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001215 "replace": replaces illegal characters with '?'
1216
1217 Returns 0 on success, -1 on failure.
1218
1219*/
1220
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001221#ifndef Py_LIMITED_API
Mark Hammond91a681d2002-08-12 07:21:58 +00001222PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 Py_UNICODE *s, /* Unicode buffer */
1224 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1225 char *output, /* Output buffer; must have size >= length+1 */
1226 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001227 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001228#endif
Guido van Rossum9e896b32000-04-05 20:11:21 +00001229
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001230/* Transforms code points that have decimal digit property to the
1231 corresponding ASCII digit code points.
1232
1233 Returns a new Unicode string on success, NULL on failure.
1234*/
1235
Georg Brandlb5503082010-12-05 11:40:48 +00001236#ifndef Py_LIMITED_API
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001237PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1238 Py_UNICODE *s, /* Unicode buffer */
1239 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
1240 );
Georg Brandlb5503082010-12-05 11:40:48 +00001241#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00001242
Martin v. Löwis011e8422009-05-05 04:43:17 +00001243/* --- File system encoding ---------------------------------------------- */
1244
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001245/* ParseTuple converter: encode str objects to bytes using
1246 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001247
1248PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1249
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001250/* ParseTuple converter: decode bytes objects to unicode using
1251 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1252
1253PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1254
Victor Stinner77c38622010-05-14 15:58:55 +00001255/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1256 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001257
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001258 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1259 encoding.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001260
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001261 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001262*/
1263
1264PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1265 const char *s /* encoded string */
1266 );
1267
Victor Stinner77c38622010-05-14 15:58:55 +00001268/* Decode a string using Py_FileSystemDefaultEncoding
1269 and the "surrogateescape" error handler.
1270
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001271 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1272 encoding.
Victor Stinner77c38622010-05-14 15:58:55 +00001273*/
1274
Martin v. Löwis011e8422009-05-05 04:43:17 +00001275PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1276 const char *s, /* encoded string */
1277 Py_ssize_t size /* size */
1278 );
1279
Victor Stinnerae6265f2010-05-15 16:27:27 +00001280/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001281 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001282
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001283 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1284 encoding.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001285*/
1286
1287PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1288 PyObject *unicode
1289 );
1290
Guido van Rossumd8225182000-03-10 22:33:05 +00001291/* --- Methods & Slots ----------------------------------------------------
1292
1293 These are capable of handling Unicode objects and strings on input
1294 (we refer to them as strings in the descriptions) and return
1295 Unicode objects or integers as appropriate. */
1296
1297/* Concat two strings giving a new Unicode string. */
1298
Mark Hammond91a681d2002-08-12 07:21:58 +00001299PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 PyObject *left, /* Left string */
1301 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 );
1303
Walter Dörwald1ab83302007-05-18 17:15:44 +00001304/* Concat two strings and put the result in *pleft
1305 (sets *pleft to NULL on error) */
1306
1307PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001308 PyObject **pleft, /* Pointer to left string */
1309 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001310 );
1311
1312/* Concat two strings, put the result in *pleft and drop the right object
1313 (sets *pleft to NULL on error) */
1314
1315PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 PyObject **pleft, /* Pointer to left string */
1317 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001318 );
1319
Guido van Rossumd8225182000-03-10 22:33:05 +00001320/* Split a string giving a list of Unicode strings.
1321
1322 If sep is NULL, splitting will be done at all whitespace
1323 substrings. Otherwise, splits occur at the given separator.
1324
1325 At most maxsplit splits will be done. If negative, no limit is set.
1326
1327 Separators are not included in the resulting list.
1328
1329*/
1330
Mark Hammond91a681d2002-08-12 07:21:58 +00001331PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 PyObject *s, /* String to split */
1333 PyObject *sep, /* String separator */
1334 Py_ssize_t maxsplit /* Maxsplit count */
1335 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001336
1337/* Ditto, but split at line breaks.
1338
1339 CRLF is considered to be one line break. Line breaks are not
1340 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001341
Mark Hammond91a681d2002-08-12 07:21:58 +00001342PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001343 PyObject *s, /* String to split */
1344 int keepends /* If true, line end markers are included */
1345 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001346
Thomas Wouters477c8d52006-05-27 19:21:47 +00001347/* Partition a string using a given separator. */
1348
1349PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 PyObject *s, /* String to partition */
1351 PyObject *sep /* String separator */
1352 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001353
1354/* Partition a string using a given separator, searching from the end of the
1355 string. */
1356
1357PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001358 PyObject *s, /* String to partition */
1359 PyObject *sep /* String separator */
1360 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001361
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001362/* Split a string giving a list of Unicode strings.
1363
1364 If sep is NULL, splitting will be done at all whitespace
1365 substrings. Otherwise, splits occur at the given separator.
1366
1367 At most maxsplit splits will be done; if maxsplit is negative, no
1368 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1369 the end of the string.
1370
1371 Separators are not included in the resulting list.
1372
1373*/
1374
1375PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001376 PyObject *s, /* String to split */
1377 PyObject *sep, /* String separator */
1378 Py_ssize_t maxsplit /* Maxsplit count */
1379 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001380
Guido van Rossumd8225182000-03-10 22:33:05 +00001381/* Translate a string by applying a character mapping table to it and
1382 return the resulting Unicode object.
1383
1384 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001385 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001386
1387 Mapping tables may be dictionaries or sequences. Unmapped character
1388 ordinals (ones which cause a LookupError) are left untouched and
1389 are copied as-is.
1390
1391*/
1392
Mark Hammond91a681d2002-08-12 07:21:58 +00001393PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001394 PyObject *str, /* String */
1395 PyObject *table, /* Translate table */
1396 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001397 );
1398
1399/* Join a sequence of strings using the given separator and return
1400 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401
Mark Hammond91a681d2002-08-12 07:21:58 +00001402PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 PyObject *separator, /* Separator string */
1404 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001405 );
1406
1407/* Return 1 if substr matches str[start:end] at the given tail end, 0
1408 otherwise. */
1409
Martin v. Löwis18e16552006-02-15 17:27:45 +00001410PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 PyObject *str, /* String */
1412 PyObject *substr, /* Prefix or Suffix string */
1413 Py_ssize_t start, /* Start index */
1414 Py_ssize_t end, /* Stop index */
1415 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001416 );
1417
1418/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001419 given search direction or -1 if not found. -2 is returned in case
1420 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001421
Martin v. Löwis18e16552006-02-15 17:27:45 +00001422PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 PyObject *str, /* String */
1424 PyObject *substr, /* Substring to find */
1425 Py_ssize_t start, /* Start index */
1426 Py_ssize_t end, /* Stop index */
1427 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001428 );
1429
Barry Warsaw51ac5802000-03-20 16:36:48 +00001430/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001431
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 PyObject *str, /* String */
1434 PyObject *substr, /* Substring to count */
1435 Py_ssize_t start, /* Start index */
1436 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001437 );
1438
Barry Warsaw51ac5802000-03-20 16:36:48 +00001439/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001440 and return the resulting Unicode object. */
1441
Mark Hammond91a681d2002-08-12 07:21:58 +00001442PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 PyObject *str, /* String */
1444 PyObject *substr, /* Substring to find */
1445 PyObject *replstr, /* Replacement substring */
1446 Py_ssize_t maxcount /* Max. number of replacements to apply;
1447 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001448 );
1449
1450/* Compare two strings and return -1, 0, 1 for less than, equal,
1451 greater than resp. */
1452
Mark Hammond91a681d2002-08-12 07:21:58 +00001453PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001454 PyObject *left, /* Left string */
1455 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001456 );
1457
Martin v. Löwis5b222132007-06-10 09:51:05 +00001458PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1459 PyObject *left,
Victor Stinnerdc2081f2010-12-27 01:49:29 +00001460 const char *right /* ASCII-encoded string */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001461 );
1462
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001463/* Rich compare two strings and return one of the following:
1464
1465 - NULL in case an exception was raised
1466 - Py_True or Py_False for successful comparisons
1467 - Py_NotImplemented in case the type combination is unknown
1468
1469 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1470 case the conversion of the arguments to Unicode fails with a
1471 UnicodeDecodeError.
1472
1473 Possible values for op:
1474
1475 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1476
1477*/
1478
1479PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 PyObject *left, /* Left string */
1481 PyObject *right, /* Right string */
1482 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001483 );
1484
Thomas Wouters7e474022000-07-16 12:04:32 +00001485/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001486 the resulting Unicode string. */
1487
Mark Hammond91a681d2002-08-12 07:21:58 +00001488PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001489 PyObject *format, /* Format string */
1490 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001491 );
1492
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001493/* Checks whether element is contained in container and return 1/0
1494 accordingly.
1495
1496 element has to coerce to a one-element Unicode string. -1 is
1497 returned in case of an error. */
1498
Mark Hammond91a681d2002-08-12 07:21:58 +00001499PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001500 PyObject *container, /* Container string */
1501 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001502 );
1503
Martin v. Löwis47383402007-08-15 07:32:56 +00001504/* Checks whether argument is a valid identifier. */
1505
1506PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1507
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001508#ifndef Py_LIMITED_API
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001509/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001510PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001511 PyUnicodeObject *self,
1512 int striptype,
1513 PyObject *sepobj
1514 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001515#endif
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001516
Eric Smith5807c412008-05-11 21:00:57 +00001517/* Using the current locale, insert the thousands grouping
1518 into the string pointed to by buffer. For the argument descriptions,
1519 see Objects/stringlib/localeutil.h */
1520
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001521#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001522PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1523 Py_ssize_t n_buffer,
1524 Py_UNICODE *digits,
1525 Py_ssize_t n_digits,
1526 Py_ssize_t min_width);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001527#endif
Eric Smith5807c412008-05-11 21:00:57 +00001528
Eric Smitha3b1ac82009-04-03 14:45:06 +00001529/* Using explicit passed-in values, insert the thousands grouping
1530 into the string pointed to by buffer. For the argument descriptions,
1531 see Objects/stringlib/localeutil.h */
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001532#ifndef Py_LIMITED_API
Eric Smith0923d1d2009-04-16 20:16:10 +00001533PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1534 Py_ssize_t n_buffer,
1535 Py_UNICODE *digits,
1536 Py_ssize_t n_digits,
1537 Py_ssize_t min_width,
1538 const char *grouping,
1539 const char *thousands_sep);
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001540#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001541/* === Characters Type APIs =============================================== */
1542
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001543/* Helper array used by Py_UNICODE_ISSPACE(). */
1544
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001545#ifndef Py_LIMITED_API
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001546PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1547
Guido van Rossumd8225182000-03-10 22:33:05 +00001548/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001549 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001550
1551 These APIs are implemented in Objects/unicodectype.c.
1552
1553*/
1554
Mark Hammond91a681d2002-08-12 07:21:58 +00001555PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001556 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001557 );
1558
Mark Hammond91a681d2002-08-12 07:21:58 +00001559PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001560 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001561 );
1562
Mark Hammond91a681d2002-08-12 07:21:58 +00001563PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001564 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001565 );
1566
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001567PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001568 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001569 );
1570
1571PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001572 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001573 );
1574
Mark Hammond91a681d2002-08-12 07:21:58 +00001575PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001576 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001577 );
1578
Mark Hammond91a681d2002-08-12 07:21:58 +00001579PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001580 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001581 );
1582
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001583PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1584 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001585 );
1586
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001587PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1588 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001589 );
1590
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001591PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1592 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001593 );
1594
Mark Hammond91a681d2002-08-12 07:21:58 +00001595PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001596 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001597 );
1598
Mark Hammond91a681d2002-08-12 07:21:58 +00001599PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001600 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001601 );
1602
Mark Hammond91a681d2002-08-12 07:21:58 +00001603PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001604 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001605 );
1606
Mark Hammond91a681d2002-08-12 07:21:58 +00001607PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001608 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001609 );
1610
Mark Hammond91a681d2002-08-12 07:21:58 +00001611PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001612 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001613 );
1614
Mark Hammond91a681d2002-08-12 07:21:58 +00001615PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001616 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001617 );
1618
Georg Brandl559e5d72008-06-11 18:37:52 +00001619PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001620 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001621 );
1622
Mark Hammond91a681d2002-08-12 07:21:58 +00001623PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001624 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001625 );
1626
Victor Stinneref8d95c2010-08-16 22:03:11 +00001627PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1628 const Py_UNICODE *u
1629 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001630
1631PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001632 Py_UNICODE *s1,
1633 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001634
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001635PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1636 Py_UNICODE *s1, const Py_UNICODE *s2);
1637
Martin v. Löwis5b222132007-06-10 09:51:05 +00001638PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001639 Py_UNICODE *s1,
1640 const Py_UNICODE *s2,
1641 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001642
1643PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001644 const Py_UNICODE *s1,
1645 const Py_UNICODE *s2
1646 );
1647
1648PyAPI_FUNC(int) Py_UNICODE_strncmp(
1649 const Py_UNICODE *s1,
1650 const Py_UNICODE *s2,
1651 size_t n
1652 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001653
1654PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001655 const Py_UNICODE *s,
1656 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001657 );
1658
Victor Stinner331ea922010-08-10 16:37:20 +00001659PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001660 const Py_UNICODE *s,
1661 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001662 );
1663
Victor Stinner71133ff2010-09-01 23:43:53 +00001664/* Create a copy of a unicode string ending with a nul character. Return NULL
1665 and raise a MemoryError exception on memory allocation failure, otherwise
1666 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1667
Victor Stinner46408602010-09-03 16:18:00 +00001668PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner71133ff2010-09-01 23:43:53 +00001669 PyObject *unicode
1670 );
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001671#endif /* Py_LIMITED_API */
Victor Stinner71133ff2010-09-01 23:43:53 +00001672
Guido van Rossumd8225182000-03-10 22:33:05 +00001673#ifdef __cplusplus
1674}
1675#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001676#endif /* !Py_UNICODEOBJECT_H */