blob: 569511f6f76bd8d17d33db275248b1df0a5b4f0e [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal (see file Misc/unicode.txt).
11
Guido van Rossum16b1ad92000-08-03 16:24:25 +000012Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000013
14
15 Original header:
16 --------------------------------------------------------------------
17
18 * Yet another Unicode string type for Python. This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
20 *
21 * Written by Fredrik Lundh, January 1999.
22 *
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
25 *
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
28 *
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000031 *
Guido van Rossumd8225182000-03-10 22:33:05 +000032 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000034 *
Guido van Rossumd8225182000-03-10 22:33:05 +000035 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000038 *
Guido van Rossumd8225182000-03-10 22:33:05 +000039 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000047 *
Guido van Rossumd8225182000-03-10 22:33:05 +000048 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
56
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000057#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000058
59/* === Internal API ======================================================= */
60
61/* --- Internal Unicode Format -------------------------------------------- */
62
Christian Heimes0625e892008-01-07 21:04:21 +000063/* Python 3.x requires unicode */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064#define Py_USING_UNICODE
Christian Heimes0625e892008-01-07 21:04:21 +000065
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
   properly set, but the default rules below don't set it.  I'll
   sort this out some other day -- fredrik@pythonware.com */
69
70#ifndef Py_UNICODE_SIZE
71#error Must define Py_UNICODE_SIZE
72#endif
73
Fredrik Lundh8f455852001-06-27 18:59:43 +000074/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
75 strings are stored as UCS-2 (with limited support for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000079#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000080
Guido van Rossumd8225182000-03-10 22:33:05 +000081/* Set these flags if the platform has "wchar.h", "wctype.h" and the
82 wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Defaults for various platforms */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000087#ifndef PY_UNICODE_TYPE
Guido van Rossumd8225182000-03-10 22:33:05 +000088
Fredrik Lundh1294ad02001-06-26 17:17:07 +000089/* Windows has a usable wchar_t type (unless we're using UCS-4) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000090# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
Guido van Rossumd8225182000-03-10 22:33:05 +000091# define HAVE_USABLE_WCHAR_T
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000092# define PY_UNICODE_TYPE wchar_t
93# endif
94
Fredrik Lundh8f455852001-06-27 18:59:43 +000095# if defined(Py_UNICODE_WIDE)
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000096# define PY_UNICODE_TYPE Py_UCS4
Guido van Rossumd8225182000-03-10 22:33:05 +000097# endif
98
99#endif
100
101/* If the compiler provides a wchar_t type we try to support it
102 through the interface functions PyUnicode_FromWideChar() and
103 PyUnicode_AsWideChar(). */
104
105#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000106# ifndef HAVE_WCHAR_H
107# define HAVE_WCHAR_H
108# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000109#endif
110
111#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
113# ifdef _HAVE_BSDI
114# include <time.h>
115# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000116# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000117#endif
118
/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as a single unsigned integer.
 *
 * The underlying type is picked from the SIZEOF_INT / SIZEOF_LONG
 * configuration macros: unsigned int when it has at least 4 bytes,
 * otherwise unsigned long when that has at least 4 bytes.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#else
/* Stop here with a clear message instead of failing later at the first
   use of an undeclared Py_UCS4. */
#error "Could not find a proper typedef for Py_UCS4"
#endif
128
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000129/* Py_UNICODE is the native Unicode storage format (code unit) used by
130 Python and represents a single Unicode element in the Unicode
131 type. */
132
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000133typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000134
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000135/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
136
/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
   produce different external names.  Combining an interpreter and an
   extension that were compiled with different Unicode width
   assumptions therefore causes an import error. */
141
142#ifndef Py_UNICODE_WIDE
143
144# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
145# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000146# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
147# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000148# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000149# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000150# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000151# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
152# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
Walter Dörwald41980ca2007-08-16 21:55:45 +0000153# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000154# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
155# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
156# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
157# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
158# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
Alexandre Vassalotti15fafbe2008-12-28 02:13:22 +0000159# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000160# define PyUnicode_Compare PyUnicodeUCS2_Compare
/* The mangled suffix repeats the public name, as in the UCS4 table. */
# define PyUnicode_CompareWithASCII PyUnicodeUCS2_CompareWithASCII
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000162# define PyUnicode_Concat PyUnicodeUCS2_Concat
Walter Dörwald1ab83302007-05-18 17:15:44 +0000163# define PyUnicode_Append PyUnicodeUCS2_Append
164# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000165# define PyUnicode_Contains PyUnicodeUCS2_Contains
166# define PyUnicode_Count PyUnicodeUCS2_Count
167# define PyUnicode_Decode PyUnicodeUCS2_Decode
168# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
169# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
170# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000171# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
Christian Heimes5894ba72007-11-04 11:43:14 +0000172# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000173# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000174# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
175# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000176# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
Walter Dörwald69652032004-09-07 20:24:22 +0000177# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000178# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
Walter Dörwald69652032004-09-07 20:24:22 +0000179# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000180# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
181# define PyUnicode_Encode PyUnicodeUCS2_Encode
182# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
183# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
184# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
185# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
186# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000187# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000188# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
189# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
190# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
191# define PyUnicode_Find PyUnicodeUCS2_Find
192# define PyUnicode_Format PyUnicodeUCS2_Format
193# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
Alexandre Vassalotti15fafbe2008-12-28 02:13:22 +0000194# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
195# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000196# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000197# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000198# define PyUnicode_FromString PyUnicodeUCS2_FromString
Walter Dörwaldd2034312007-05-18 16:29:38 +0000199# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
Alexandre Vassalotti15fafbe2008-12-28 02:13:22 +0000200# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
Walter Dörwald14176a52007-05-18 17:04:42 +0000201# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
Martin v. Löwis011e8422009-05-05 04:43:17 +0000202# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000203# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
204# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
205# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
Martin v. Löwis47383402007-08-15 07:32:56 +0000206# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000207# define PyUnicode_Join PyUnicodeUCS2_Join
Thomas Wouters477c8d52006-05-27 19:21:47 +0000208# define PyUnicode_Partition PyUnicodeUCS2_Partition
209# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
210# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000211# define PyUnicode_Replace PyUnicodeUCS2_Replace
212# define PyUnicode_Resize PyUnicodeUCS2_Resize
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000213# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000214# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
215# define PyUnicode_Split PyUnicodeUCS2_Split
216# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
217# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
218# define PyUnicode_Translate PyUnicodeUCS2_Translate
219# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
220# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
221# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
222# define _PyUnicode_Init _PyUnicodeUCS2_Init
223# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
224# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
225# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
226# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
227# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
228# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
Georg Brandl559e5d72008-06-11 18:37:52 +0000229# define _PyUnicode_IsPrintable _PyUnicodeUCS2_IsPrintable
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000230# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000231# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
232# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000233# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
234# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
235# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
236# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
237# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
238# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
239# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
240# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
241
242#else
243
244# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
245# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000246# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
247# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000248# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000249# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000250# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000251# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
252# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
Walter Dörwald41980ca2007-08-16 21:55:45 +0000253# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000254# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
255# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
256# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
257# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
258# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
Alexandre Vassalotti15fafbe2008-12-28 02:13:22 +0000259# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000260# define PyUnicode_Compare PyUnicodeUCS4_Compare
Benjamin Petersonad465f92010-05-07 20:21:26 +0000261# define PyUnicode_CompareWithASCII PyUnicodeUCS4_CompareWithASCII
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000262# define PyUnicode_Concat PyUnicodeUCS4_Concat
Walter Dörwald1ab83302007-05-18 17:15:44 +0000263# define PyUnicode_Append PyUnicodeUCS4_Append
264# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000265# define PyUnicode_Contains PyUnicodeUCS4_Contains
266# define PyUnicode_Count PyUnicodeUCS4_Count
267# define PyUnicode_Decode PyUnicodeUCS4_Decode
268# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
269# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
270# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000271# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
Christian Heimes5894ba72007-11-04 11:43:14 +0000272# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000273# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000274# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
275# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000276# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
Walter Dörwald69652032004-09-07 20:24:22 +0000277# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000278# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
Walter Dörwald69652032004-09-07 20:24:22 +0000279# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000280# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
281# define PyUnicode_Encode PyUnicodeUCS4_Encode
282# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
283# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
284# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
285# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
286# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000287# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000288# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
289# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
290# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
291# define PyUnicode_Find PyUnicodeUCS4_Find
292# define PyUnicode_Format PyUnicodeUCS4_Format
293# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
Alexandre Vassalotti15fafbe2008-12-28 02:13:22 +0000294# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
295# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000296# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000297# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000298# define PyUnicode_FromString PyUnicodeUCS4_FromString
Walter Dörwaldd2034312007-05-18 16:29:38 +0000299# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
Alexandre Vassalotti15fafbe2008-12-28 02:13:22 +0000300# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000301# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
Martin v. Löwis011e8422009-05-05 04:43:17 +0000302# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000303# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
304# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
305# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
Martin v. Löwis47383402007-08-15 07:32:56 +0000306# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000307# define PyUnicode_Join PyUnicodeUCS4_Join
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308# define PyUnicode_Partition PyUnicodeUCS4_Partition
309# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
310# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000311# define PyUnicode_Replace PyUnicodeUCS4_Replace
312# define PyUnicode_Resize PyUnicodeUCS4_Resize
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000313# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000314# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
315# define PyUnicode_Split PyUnicodeUCS4_Split
316# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
317# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
318# define PyUnicode_Translate PyUnicodeUCS4_Translate
319# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
320# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
321# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
322# define _PyUnicode_Init _PyUnicodeUCS4_Init
323# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
324# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
325# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
326# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
327# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
328# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
Georg Brandl559e5d72008-06-11 18:37:52 +0000329# define _PyUnicode_IsPrintable _PyUnicodeUCS4_IsPrintable
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000330# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000331# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
332# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000333# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
334# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
335# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
336# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
337# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
338# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
339# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
340# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
341
342
343#endif
344
Guido van Rossumd8225182000-03-10 22:33:05 +0000345/* --- Internal Unicode Operations ---------------------------------------- */
346
347/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000348 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
Raymond Hettinger57341c32004-10-31 05:46:59 +0000349 configure Python using --with-wctype-functions. This reduces the
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000351
352#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
353
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000354#include <wctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000355
356#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
357
358#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
359#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
360#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
361#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
362
363#define Py_UNICODE_TOLOWER(ch) towlower(ch)
364#define Py_UNICODE_TOUPPER(ch) towupper(ch)
365#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
366
367#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
368#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
369#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000370#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000371
372#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
373#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
374#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
375
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000376#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
377
Guido van Rossumd8225182000-03-10 22:33:05 +0000378#else
379
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000380/* Since splitting on whitespace is an important use case, and
381 whitespace in most situations is solely ASCII whitespace, we
382 optimize for the common case by using a quick look-up table
383 _Py_ascii_whitespace (see below) with an inlined check.
Christian Heimes190d79e2008-01-30 11:58:22 +0000384
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000385 */
Christian Heimes190d79e2008-01-30 11:58:22 +0000386#define Py_UNICODE_ISSPACE(ch) \
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
Guido van Rossumd8225182000-03-10 22:33:05 +0000388
389#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
390#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
391#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
392#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
393
394#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
395#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
396#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
397
398#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
399#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
400#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
Georg Brandl559e5d72008-06-11 18:37:52 +0000401#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000402
403#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
404#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
405#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
406
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000407#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000408
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000409#endif
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000410
/* True if ch is alphabetic, a decimal digit, a digit or numeric, as
   reported by the corresponding Py_UNICODE_IS* macros.  The checks are
   joined with ||, so ch may be expanded up to four times; do not pass
   an expression with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000416
/* Copy `length` Py_UNICODE code units from `source` to `target` using
   Py_MEMCPY.  The byte count is (length)*sizeof(Py_UNICODE); handling
   of overlapping buffers is whatever Py_MEMCPY provides. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000419
/* Store `value` into each of the first `length` elements of the
   Py_UNICODE buffer `target`.  Each argument is evaluated exactly once
   (target, then value, then length), so a `length` expression with
   side effects is not re-run on every iteration.  A length of zero or
   less stores nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000424
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit of the substring are compared
   before memcmp() is run over the whole substring.  Both `string` and
   `substring` are accessed through their ->str and ->length members. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
Guido van Rossumd8225182000-03-10 22:33:05 +0000432
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433#ifdef __cplusplus
434extern "C" {
435#endif
436
Guido van Rossumd8225182000-03-10 22:33:05 +0000437/* --- Unicode Type ------------------------------------------------------- */
438
439typedef struct {
440 PyObject_HEAD
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441 Py_ssize_t length; /* Length of raw Unicode data in buffer */
442 Py_UNICODE *str; /* Raw Unicode buffer */
443 long hash; /* Hash value; -1 if not set */
444 int state; /* != 0 if interned. In this case the two
445 * references from the dictionary to this object
446 * are *not* counted in ob_refcnt. */
447 PyObject *defenc; /* (Default) Encoded version as Python
448 string, or NULL; this is used for
449 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000450} PyUnicodeObject;
451
Mark Hammond91a681d2002-08-12 07:21:58 +0000452PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000453PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000454
Walter Dörwald16807132007-05-25 13:52:07 +0000455#define SSTATE_NOT_INTERNED 0
456#define SSTATE_INTERNED_MORTAL 1
457#define SSTATE_INTERNED_IMMORTAL 2
458
Thomas Wouters27d517b2007-02-25 20:39:11 +0000459#define PyUnicode_Check(op) \
Christian Heimes90aa7642007-12-19 02:45:37 +0000460 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
461#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000462
/* Fast access macros.

   Each macro asserts PyUnicode_Check(op), casts op to PyUnicodeObject *
   and reads a field directly:

     PyUnicode_GET_SIZE       the length field
     PyUnicode_GET_DATA_SIZE  length * sizeof(Py_UNICODE)
     PyUnicode_AS_UNICODE     the str buffer pointer
     PyUnicode_AS_DATA        the str buffer pointer as const char *

   op appears twice in every expansion (once inside assert), so do not
   pass an expression with side effects. */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
Guido van Rossumd8225182000-03-10 22:33:05 +0000472
473/* --- Constants ---------------------------------------------------------- */
474
475/* This Unicode character will be used as replacement character during
476 decoding if the errors argument is set to "replace". Note: the
477 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
478 Unicode 3.0. */
479
480#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
481
482/* === Public API ========================================================= */
483
484/* --- Plain Py_UNICODE --------------------------------------------------- */
485
486/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000487 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000488
489 u may be NULL which causes the contents to be undefined. It is the
490 user's responsibility to fill in the needed data afterwards. Note
491 that modifying the Unicode object contents after construction is
492 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000493
494 The buffer is copied into the new object. */
495
Mark Hammond91a681d2002-08-12 07:21:58 +0000496PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000497 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000498 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000499 );
500
Walter Dörwaldd2034312007-05-18 16:29:38 +0000501/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
502PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
503 const char *u, /* char buffer */
504 Py_ssize_t size /* size of buffer */
505 );
506
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000507/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
508 Latin-1 encoded bytes */
509PyAPI_FUNC(PyObject*) PyUnicode_FromString(
510 const char *u /* string */
511 );
512
Guido van Rossumd8225182000-03-10 22:33:05 +0000513/* Return a read-only pointer to the Unicode object's internal
514 Py_UNICODE buffer. */
515
Mark Hammond91a681d2002-08-12 07:21:58 +0000516PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000517 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000518 );
519
520/* Get the length of the Unicode object. */
521
Martin v. Löwis18e16552006-02-15 17:27:45 +0000522PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000523 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000524 );
525
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000526/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000527PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000528
Guido van Rossum52c23592000-04-10 13:41:41 +0000529/* Resize an already allocated Unicode object to the new size length.
530
531 *unicode is modified to point to the new (resized) object and 0
532 returned on success.
533
534 This API may only be called by the function which also called the
535 Unicode constructor. The refcount on the object must be 1. Otherwise,
536 an error is returned.
537
538 Error handling is implemented as follows: an exception is set, -1
539 is returned and *unicode left untouched.
540
541*/
542
Mark Hammond91a681d2002-08-12 07:21:58 +0000543PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000544 PyObject **unicode, /* Pointer to the Unicode object */
545 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000546 );
547
Guido van Rossumd8225182000-03-10 22:33:05 +0000548/* Coerce obj to a Unicode object and return a reference with
549 *incremented* refcount.
550
551 Coercion is done in the following way:
552
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000553 1. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000554 under the assumptions that they contain data using the current
555 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000556
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000557 2. All other objects (including Unicode objects) raise an
558 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000559
560 The API returns NULL in case of an error. The caller is responsible
561 for decref'ing the returned objects.
562
563*/
564
Mark Hammond91a681d2002-08-12 07:21:58 +0000565PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 register PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 const char *encoding, /* encoding */
568 const char *errors /* error handling */
569 );
570
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000571/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000573
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000574 Unicode objects are passed back as-is (subclasses are converted to
575 true Unicode objects), all other objects are delegated to
576 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
577 using the default encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000578
579 The API returns NULL in case of an error. The caller is responsible
580 for decref'ing the returned objects.
581
582*/
583
Mark Hammond91a681d2002-08-12 07:21:58 +0000584PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000585 register PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000586 );
587
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
589PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
590
Eric Smith4a7d76d2008-05-30 18:10:19 +0000591/* Format the object based on the format_spec, as defined in PEP 3101
592 (Advanced String Formatting). */
593PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
594 Py_UNICODE *format_spec,
595 Py_ssize_t format_spec_len);
596
Walter Dörwald16807132007-05-25 13:52:07 +0000597PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
598PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
599PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
600PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
601
602/* Use only if you know it's a string */
603#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
604
Guido van Rossumd8225182000-03-10 22:33:05 +0000605/* --- wchar_t support for platforms which support it --------------------- */
606
607#ifdef HAVE_WCHAR_H
608
609/* Create a Unicode Object from the wchar_t buffer w of the given
610 size.
611
612 The buffer is copied into the new object. */
613
Mark Hammond91a681d2002-08-12 07:21:58 +0000614PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000615 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000616 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000617 );
618
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000619/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000620 most size wchar_t characters are copied.
621
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000622 Note that the resulting wchar_t string may or may not be
623 0-terminated. It is the responsibility of the caller to make sure
624 that the wchar_t string is 0-terminated in case this is required by
625 the application.
626
627 Returns the number of wchar_t characters copied (excluding a
628 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000629 error. */
630
Martin v. Löwis18e16552006-02-15 17:27:45 +0000631PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000632 PyUnicodeObject *unicode, /* Unicode object */
633 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000634 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000635 );
636
637#endif
638
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000639/* --- Unicode ordinals --------------------------------------------------- */
640
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000641/* Create a Unicode Object from the given Unicode code point ordinal.
642
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000643 The ordinal must be in range(0x10000) on narrow Python builds
644 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
645 raised in case it is not.
646
647*/
648
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000649PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000650
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000651/* --- Free-list management ----------------------------------------------- */
652
653/* Clear the free list used by the Unicode implementation.
654
655 This can be used to release memory used for objects on the free
656 list back to the Python memory allocator.
657
658*/
659
660PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
661
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000663
664 Many of these APIs take two arguments encoding and errors. These
665 parameters encoding and errors have the same semantics as the ones
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000666 of the builtin unicode() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000667
Fred Drakecb093fe2000-05-09 19:51:53 +0000668 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000669
670 Error handling is set by errors which may also be set to NULL
671 meaning to use the default handling defined for the codec. Default
672 error handling for all builtin codecs is "strict" (ValueErrors are
673 raised).
674
675 The codecs all use a similar interface. Only deviations from the
676 generic ones are documented.
677
678*/
679
Fred Drakecb093fe2000-05-09 19:51:53 +0000680/* --- Manage the default encoding ---------------------------------------- */
681
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000682/* Return a Python string holding the default encoded value of the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 Unicode object.
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000684
685 The resulting string is cached in the Unicode object for subsequent
686 usage by this function. The cached version is needed to implement
687 the character buffer interface and will live (at least) as long as
688 the Unicode object itself.
689
690 The refcount of the string is *not* incremented.
691
692 *** Exported for internal use by the interpreter only !!! ***
693
694*/
695
Mark Hammond91a681d2002-08-12 07:21:58 +0000696PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000697 PyObject *unicode,
698 const char *errors);
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000699
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000700/* Returns a pointer to the default encoding (normally, UTF-8) of the
701 Unicode object unicode and the size of the encoded representation
702 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000703
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000704 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000705
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000706 *** This API is for interpreter INTERNAL USE ONLY and will likely
707 *** be removed or changed for Python 3.1.
708
709 *** If you need to access the Unicode object as UTF-8 bytes string,
710 *** please use PyUnicode_AsUTF8String() instead.
711
Martin v. Löwis5b222132007-06-10 09:51:05 +0000712*/
713
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000714PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000715 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000716 Py_ssize_t *size);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000717
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000718/* Returns a pointer to the default encoding (normally, UTF-8) of the
719 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000720
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000721 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000722 extracted from the returned data.
723
724 *** This API is for interpreter INTERNAL USE ONLY and will likely
725 *** be removed or changed for Python 3.1.
726
727 *** If you need to access the Unicode object as UTF-8 bytes string,
728 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000729
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000730*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000731
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000732PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +0000733
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000734/* Returns the currently active default encoding.
Fred Drakecb093fe2000-05-09 19:51:53 +0000735
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000736 The default encoding is currently implemented as run-time settable
737 process global. This may change in future versions of the
738 interpreter to become a parameter which is managed on a per-thread
739 basis.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740
Fred Drakecb093fe2000-05-09 19:51:53 +0000741 */
742
Mark Hammond91a681d2002-08-12 07:21:58 +0000743PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000744
745/* Sets the currently active default encoding.
746
747 Returns 0 on success, -1 in case of an error.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748
Fred Drakecb093fe2000-05-09 19:51:53 +0000749 */
750
Mark Hammond91a681d2002-08-12 07:21:58 +0000751PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 const char *encoding /* Encoding name in standard form */
Fred Drakecb093fe2000-05-09 19:51:53 +0000753 );
754
Guido van Rossumd8225182000-03-10 22:33:05 +0000755/* --- Generic Codecs ----------------------------------------------------- */
756
757/* Create a Unicode object by decoding the encoded string s of the
758 given size. */
759
Mark Hammond91a681d2002-08-12 07:21:58 +0000760PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000761 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000762 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000763 const char *encoding, /* encoding */
764 const char *errors /* error handling */
765 );
766
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000767/* Decode a Unicode object unicode and return the result as Python
768 object. */
769
770PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 PyObject *unicode, /* Unicode object */
772 const char *encoding, /* encoding */
773 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000774 );
775
776/* Decode a Unicode object unicode and return the result as Unicode
777 object. */
778
779PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 PyObject *unicode, /* Unicode object */
781 const char *encoding, /* encoding */
782 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000783 );
784
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000786 Python string object. */
787
Mark Hammond91a681d2002-08-12 07:21:58 +0000788PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000789 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000790 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000791 const char *encoding, /* encoding */
792 const char *errors /* error handling */
793 );
794
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000795/* Encodes a Unicode object and returns the result as Python
796 object. */
797
798PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 PyObject *unicode, /* Unicode object */
800 const char *encoding, /* encoding */
801 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000802 );
803
Guido van Rossumd8225182000-03-10 22:33:05 +0000804/* Encodes a Unicode object and returns the result as Python string
805 object. */
806
Mark Hammond91a681d2002-08-12 07:21:58 +0000807PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 PyObject *unicode, /* Unicode object */
809 const char *encoding, /* encoding */
810 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000811 );
812
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000813/* Encodes a Unicode object and returns the result as Unicode
814 object. */
815
816PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 PyObject *unicode, /* Unicode object */
818 const char *encoding, /* encoding */
819 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000820 );
821
822/* Build an encoding map. */
823
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000824PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
825 PyObject* string /* 256 character map */
826 );
827
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000828/* --- UTF-7 Codecs ------------------------------------------------------- */
829
Mark Hammond91a681d2002-08-12 07:21:58 +0000830PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 const char *string, /* UTF-7 encoded string */
832 Py_ssize_t length, /* size of string */
833 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000834 );
835
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000836PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 const char *string, /* UTF-7 encoded string */
838 Py_ssize_t length, /* size of string */
839 const char *errors, /* error handling */
840 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000841 );
842
Mark Hammond91a681d2002-08-12 07:21:58 +0000843PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844 const Py_UNICODE *data, /* Unicode char buffer */
845 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
846 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
847 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
848 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000849 );
850
Guido van Rossumd8225182000-03-10 22:33:05 +0000851/* --- UTF-8 Codecs ------------------------------------------------------- */
852
Mark Hammond91a681d2002-08-12 07:21:58 +0000853PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 const char *string, /* UTF-8 encoded string */
855 Py_ssize_t length, /* size of string */
856 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000857 );
858
Walter Dörwald69652032004-09-07 20:24:22 +0000859PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000860 const char *string, /* UTF-8 encoded string */
861 Py_ssize_t length, /* size of string */
862 const char *errors, /* error handling */
863 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000864 );
865
Mark Hammond91a681d2002-08-12 07:21:58 +0000866PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000868 );
869
Mark Hammond91a681d2002-08-12 07:21:58 +0000870PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000871 const Py_UNICODE *data, /* Unicode char buffer */
872 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
873 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000874 );
875
Walter Dörwald41980ca2007-08-16 21:55:45 +0000876/* --- UTF-32 Codecs ------------------------------------------------------ */
877
878/* Decodes length bytes from a UTF-32 encoded buffer string and returns
879 the corresponding Unicode object.
880
881 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000883
884 If byteorder is non-NULL, the decoder starts decoding using the
885 given byte order:
886
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000887 *byteorder == -1: little endian
888 *byteorder == 0: native order
889 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000890
891 In native mode, the first four bytes of the stream are checked for a
892 BOM mark. If found, the BOM mark is analysed, the byte order
893 adjusted and the BOM skipped. In the other modes, no BOM mark
894 interpretation is done. After completion, *byteorder is set to the
895 current byte order at the end of input data.
896
897 If byteorder is NULL, the codec starts in native order mode.
898
899*/
900
901PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000902 const char *string, /* UTF-32 encoded string */
903 Py_ssize_t length, /* size of string */
904 const char *errors, /* error handling */
905 int *byteorder /* pointer to byteorder to use
906 0=native;-1=LE,1=BE; updated on
907 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000908 );
909
910PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 const char *string, /* UTF-32 encoded string */
912 Py_ssize_t length, /* size of string */
913 const char *errors, /* error handling */
914 int *byteorder, /* pointer to byteorder to use
915 0=native;-1=LE,1=BE; updated on
916 exit */
917 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000918 );
919
920/* Returns a Python string using the UTF-32 encoding in native byte
921 order. The string always starts with a BOM mark. */
922
923PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000925 );
926
927/* Returns a Python string object holding the UTF-32 encoded value of
928 the Unicode data.
929
930 If byteorder is not 0, output is written according to the following
931 byte order:
932
933 byteorder == -1: little endian
934 byteorder == 0: native byte order (writes a BOM mark)
935 byteorder == 1: big endian
936
937 If byteorder is 0, the output string will always start with the
938 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
939 prepended.
940
941*/
942
943PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000944 const Py_UNICODE *data, /* Unicode char buffer */
945 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
946 const char *errors, /* error handling */
947 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000948 );
949
Guido van Rossumd8225182000-03-10 22:33:05 +0000950/* --- UTF-16 Codecs ------------------------------------------------------ */
951
Guido van Rossum9e896b32000-04-05 20:11:21 +0000952/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000953 the corresponding Unicode object.
954
955 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000956 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000957
958 If byteorder is non-NULL, the decoder starts decoding using the
959 given byte order:
960
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000961 *byteorder == -1: little endian
962 *byteorder == 0: native order
963 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000964
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000965 In native mode, the first two bytes of the stream are checked for a
966 BOM mark. If found, the BOM mark is analysed, the byte order
967 adjusted and the BOM skipped. In the other modes, no BOM mark
968 interpretation is done. After completion, *byteorder is set to the
969 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000970
971 If byteorder is NULL, the codec starts in native order mode.
972
973*/
974
Mark Hammond91a681d2002-08-12 07:21:58 +0000975PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000976 const char *string, /* UTF-16 encoded string */
977 Py_ssize_t length, /* size of string */
978 const char *errors, /* error handling */
979 int *byteorder /* pointer to byteorder to use
980 0=native;-1=LE,1=BE; updated on
981 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000982 );
983
Walter Dörwald69652032004-09-07 20:24:22 +0000984PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000985 const char *string, /* UTF-16 encoded string */
986 Py_ssize_t length, /* size of string */
987 const char *errors, /* error handling */
988 int *byteorder, /* pointer to byteorder to use
989 0=native;-1=LE,1=BE; updated on
990 exit */
991 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000992 );
993
Guido van Rossumd8225182000-03-10 22:33:05 +0000994/* Returns a Python string using the UTF-16 encoding in native byte
995 order. The string always starts with a BOM mark. */
996
Mark Hammond91a681d2002-08-12 07:21:58 +0000997PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000998 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 );
1000
1001/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +00001002 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +00001003
1004 If byteorder is not 0, output is written according to the following
1005 byte order:
1006
1007 byteorder == -1: little endian
1008 byteorder == 0: native byte order (writes a BOM mark)
1009 byteorder == 1: big endian
1010
1011 If byteorder is 0, the output string will always start with the
1012 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1013 prepended.
1014
1015 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1016 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +00001017 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +00001018
1019*/
1020
Mark Hammond91a681d2002-08-12 07:21:58 +00001021PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001022 const Py_UNICODE *data, /* Unicode char buffer */
1023 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1024 const char *errors, /* error handling */
1025 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001026 );
1027
1028/* --- Unicode-Escape Codecs ---------------------------------------------- */
1029
Mark Hammond91a681d2002-08-12 07:21:58 +00001030PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001031 const char *string, /* Unicode-Escape encoded string */
1032 Py_ssize_t length, /* size of string */
1033 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001034 );
1035
Mark Hammond91a681d2002-08-12 07:21:58 +00001036PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001037 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001038 );
1039
Mark Hammond91a681d2002-08-12 07:21:58 +00001040PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001041 const Py_UNICODE *data, /* Unicode char buffer */
1042 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001043 );
1044
1045/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1046
Mark Hammond91a681d2002-08-12 07:21:58 +00001047PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048 const char *string, /* Raw-Unicode-Escape encoded string */
1049 Py_ssize_t length, /* size of string */
1050 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001051 );
1052
Mark Hammond91a681d2002-08-12 07:21:58 +00001053PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001054 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001055 );
1056
Mark Hammond91a681d2002-08-12 07:21:58 +00001057PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001058 const Py_UNICODE *data, /* Unicode char buffer */
1059 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001060 );
1061
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001062/* --- Unicode Internal Codec ---------------------------------------------
1063
1064 Only for internal use in _codecsmodule.c */
1065
1066PyObject *_PyUnicode_DecodeUnicodeInternal(
1067 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001068 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001069 const char *errors
1070 );
1071
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001072/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001073
1074 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1075
1076*/
1077
Mark Hammond91a681d2002-08-12 07:21:58 +00001078PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001079 const char *string, /* Latin-1 encoded string */
1080 Py_ssize_t length, /* size of string */
1081 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001082 );
1083
Mark Hammond91a681d2002-08-12 07:21:58 +00001084PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001086 );
1087
Mark Hammond91a681d2002-08-12 07:21:58 +00001088PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001089 const Py_UNICODE *data, /* Unicode char buffer */
1090 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1091 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001092 );
1093
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001095
1096 Only 7-bit ASCII data is accepted. All other codes generate errors.
1097
1098*/
1099
Mark Hammond91a681d2002-08-12 07:21:58 +00001100PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 const char *string, /* ASCII encoded string */
1102 Py_ssize_t length, /* size of string */
1103 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001104 );
1105
Mark Hammond91a681d2002-08-12 07:21:58 +00001106PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001108 );
1109
Mark Hammond91a681d2002-08-12 07:21:58 +00001110PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001111 const Py_UNICODE *data, /* Unicode char buffer */
1112 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1113 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001114 );
1115
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001117
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001119
1120 Decoding mappings must map single string characters to single
1121 Unicode characters, integers (which are then interpreted as Unicode
1122 ordinals) or None (meaning "undefined mapping" and causing an
1123 error).
1124
1125 Encoding mappings must map single Unicode characters to single
1126 string characters, integers (which are then interpreted as Latin-1
1127 ordinals) or None (meaning "undefined mapping" and causing an
1128 error).
1129
1130 If a character lookup fails with a LookupError, the character is
1131 copied as-is meaning that its ordinal value will be interpreted as
1132 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1133 to contain those mappings which map characters to different code
1134 points.
1135
1136*/
1137
Mark Hammond91a681d2002-08-12 07:21:58 +00001138PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001139 const char *string, /* Encoded string */
1140 Py_ssize_t length, /* size of string */
1141 PyObject *mapping, /* character mapping
1142 (char ordinal -> unicode ordinal) */
1143 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001144 );
1145
Mark Hammond91a681d2002-08-12 07:21:58 +00001146PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 PyObject *unicode, /* Unicode object */
1148 PyObject *mapping /* character mapping
1149 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001150 );
1151
Mark Hammond91a681d2002-08-12 07:21:58 +00001152PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 const Py_UNICODE *data, /* Unicode char buffer */
1154 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1155 PyObject *mapping, /* character mapping
1156 (unicode ordinal -> char ordinal) */
1157 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001158 );
1159
1160/* Translate a Py_UNICODE buffer of the given length by applying a
1161 character mapping table to it and return the resulting Unicode
1162 object.
1163
1164 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001166
1167 Mapping tables may be dictionaries or sequences. Unmapped character
1168 ordinals (ones which cause a LookupError) are left untouched and
1169 are copied as-is.
1170
1171*/
1172
Mark Hammond91a681d2002-08-12 07:21:58 +00001173PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001174 const Py_UNICODE *data, /* Unicode char buffer */
1175 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1176 PyObject *table, /* Translate table */
1177 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001178 );
1179
Guido van Rossumefec1152000-03-28 02:01:15 +00001180#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +00001181
Guido van Rossumefec1152000-03-28 02:01:15 +00001182/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001183
Mark Hammond91a681d2002-08-12 07:21:58 +00001184PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001185 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001186 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001187 const char *errors /* error handling */
1188 );
1189
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001190PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1191 const char *string, /* MBCS encoded string */
1192 Py_ssize_t length, /* size of string */
1193 const char *errors, /* error handling */
1194 Py_ssize_t *consumed /* bytes consumed */
1195 );
1196
Mark Hammond91a681d2002-08-12 07:21:58 +00001197PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001198 PyObject *unicode /* Unicode object */
1199 );
1200
Mark Hammond91a681d2002-08-12 07:21:58 +00001201PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001202 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001203 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001204 const char *errors /* error handling */
1205 );
1206
Guido van Rossumefec1152000-03-28 02:01:15 +00001207#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001208
Guido van Rossum9e896b32000-04-05 20:11:21 +00001209/* --- Decimal Encoder ---------------------------------------------------- */
1210
1211/* Takes a Unicode string holding a decimal value and writes it into
1212 an output buffer using standard ASCII digit codes.
1213
1214 The output buffer has to provide at least length+1 bytes of storage
1215 area. The output string is 0-terminated.
1216
1217 The encoder converts whitespace to ' ', decimal characters to their
1218 corresponding ASCII digit and all other Latin-1 characters except
1219 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1220 are treated as errors. This includes embedded NUL characters.
1221
1222 Error handling is defined by the errors argument:
1223
1224 NULL or "strict": raise a ValueError
1225 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001227 "replace": replaces illegal characters with '?'
1228
1229 Returns 0 on success, -1 on failure.
1230
1231*/
1232
Mark Hammond91a681d2002-08-12 07:21:58 +00001233PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 Py_UNICODE *s, /* Unicode buffer */
1235 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1236 char *output, /* Output buffer; must have size >= length + 1 */
1237 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001238 );
1239
Martin v. Löwis011e8422009-05-05 04:43:17 +00001240/* --- File system encoding ---------------------------------------------- */
1241
1242/* ParseTuple converter which converts a Unicode object into the file
Victor Stinner77c38622010-05-14 15:58:55 +00001243 system encoding as a bytes object, using the "surrogateescape" error
1244 handler; bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001245
1246PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1247
Victor Stinner77c38622010-05-14 15:58:55 +00001248/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1249 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001250
Victor Stinner77c38622010-05-14 15:58:55 +00001251 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001252
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001253 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001254*/
1255
1256PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1257 const char *s /* encoded string */
1258 );
1259
Victor Stinner77c38622010-05-14 15:58:55 +00001260/* Decode a string using Py_FileSystemDefaultEncoding
1261 and the "surrogateescape" error handler.
1262
1263 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1264*/
1265
Martin v. Löwis011e8422009-05-05 04:43:17 +00001266PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1267 const char *s, /* encoded string */
1268 Py_ssize_t size /* size */
1269 );
1270
Victor Stinnerae6265f2010-05-15 16:27:27 +00001271/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001272 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001273
1274 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1275*/
1276
1277PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1278 PyObject *unicode
1279 );
1280
Guido van Rossumd8225182000-03-10 22:33:05 +00001281/* --- Methods & Slots ----------------------------------------------------
1282
1283 These are capable of handling Unicode objects and strings on input
1284 (we refer to them as strings in the descriptions) and return
1285 Unicode objects or integers as appropriate. */
1286
1287/* Concat two strings giving a new Unicode string. */
1288
Mark Hammond91a681d2002-08-12 07:21:58 +00001289PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 PyObject *left, /* Left string */
1291 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001292 );
1293
Walter Dörwald1ab83302007-05-18 17:15:44 +00001294/* Concat two strings and put the result in *pleft
1295 (sets *pleft to NULL on error) */
1296
1297PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 PyObject **pleft, /* Pointer to left string */
1299 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001300 );
1301
1302/* Concat two strings, put the result in *pleft and drop the right object
1303 (sets *pleft to NULL on error) */
1304
1305PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 PyObject **pleft, /* Pointer to left string */
1307 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001308 );
1309
Guido van Rossumd8225182000-03-10 22:33:05 +00001310/* Split a string giving a list of Unicode strings.
1311
1312 If sep is NULL, splitting will be done at all whitespace
1313 substrings. Otherwise, splits occur at the given separator.
1314
1315 At most maxsplit splits will be done. If negative, no limit is set.
1316
1317 Separators are not included in the resulting list.
1318
1319*/
1320
Mark Hammond91a681d2002-08-12 07:21:58 +00001321PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 PyObject *s, /* String to split */
1323 PyObject *sep, /* String separator */
1324 Py_ssize_t maxsplit /* Maxsplit count */
1325 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001326
1327/* Ditto, but split at line breaks.
1328
1329 CRLF is considered to be one line break. Line breaks are not
1330 included in the resulting list unless keepends is true. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001331
Mark Hammond91a681d2002-08-12 07:21:58 +00001332PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 PyObject *s, /* String to split */
1334 int keepends /* If true, line end markers are included */
1335 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001336
Thomas Wouters477c8d52006-05-27 19:21:47 +00001337/* Partition a string using a given separator. */
1338
1339PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001340 PyObject *s, /* String to partition */
1341 PyObject *sep /* String separator */
1342 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001343
1344/* Partition a string using a given separator, searching from the end of the
1345 string. */
1346
1347PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001348 PyObject *s, /* String to partition */
1349 PyObject *sep /* String separator */
1350 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001351
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001352/* Split a string giving a list of Unicode strings.
1353
1354 If sep is NULL, splitting will be done at all whitespace
1355 substrings. Otherwise, splits occur at the given separator.
1356
1357 At most maxsplit splits will be done; if maxsplit is negative, no
1358 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1359 the end of the string.
1360
1361 Separators are not included in the resulting list.
1362
1363*/
1364
1365PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 PyObject *s, /* String to split */
1367 PyObject *sep, /* String separator */
1368 Py_ssize_t maxsplit /* Maxsplit count */
1369 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001370
Guido van Rossumd8225182000-03-10 22:33:05 +00001371/* Translate a string by applying a character mapping table to it and
1372 return the resulting Unicode object.
1373
1374 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001375 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001376
1377 Mapping tables may be dictionaries or sequences. Unmapped character
1378 ordinals (ones which cause a LookupError) are left untouched and
1379 are copied as-is.
1380
1381*/
1382
Mark Hammond91a681d2002-08-12 07:21:58 +00001383PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 PyObject *str, /* String */
1385 PyObject *table, /* Translate table */
1386 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001387 );
1388
1389/* Join a sequence of strings using the given separator and return
1390 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391
Mark Hammond91a681d2002-08-12 07:21:58 +00001392PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001393 PyObject *separator, /* Separator string */
1394 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001395 );
1396
1397/* Return 1 if substr matches str[start:end] at the given tail end, 0
1398 otherwise. */
1399
Martin v. Löwis18e16552006-02-15 17:27:45 +00001400PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 PyObject *str, /* String */
1402 PyObject *substr, /* Prefix or Suffix string */
1403 Py_ssize_t start, /* Start index */
1404 Py_ssize_t end, /* Stop index */
1405 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001406 );
1407
1408/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001409 given search direction or -1 if not found. -2 is returned in case
1410 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001411
Martin v. Löwis18e16552006-02-15 17:27:45 +00001412PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 PyObject *str, /* String */
1414 PyObject *substr, /* Substring to find */
1415 Py_ssize_t start, /* Start index */
1416 Py_ssize_t end, /* Stop index */
1417 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001418 );
1419
Barry Warsaw51ac5802000-03-20 16:36:48 +00001420/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001421
Martin v. Löwis18e16552006-02-15 17:27:45 +00001422PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001423 PyObject *str, /* String */
1424 PyObject *substr, /* Substring to count */
1425 Py_ssize_t start, /* Start index */
1426 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001427 );
1428
Barry Warsaw51ac5802000-03-20 16:36:48 +00001429/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001430 and return the resulting Unicode object. */
1431
Mark Hammond91a681d2002-08-12 07:21:58 +00001432PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001433 PyObject *str, /* String */
1434 PyObject *substr, /* Substring to find */
1435 PyObject *replstr, /* Substring to replace */
1436 Py_ssize_t maxcount /* Max. number of replacements to apply;
1437 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001438 );
1439
1440/* Compare two strings and return -1, 0, 1 for less than, equal,
1441 greater than resp. */
1442
Mark Hammond91a681d2002-08-12 07:21:58 +00001443PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001444 PyObject *left, /* Left string */
1445 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001446 );
1447
Martin v. Löwis5b222132007-06-10 09:51:05 +00001448PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1449 PyObject *left,
1450 const char *right
1451 );
1452
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001453/* Rich compare two strings and return one of the following:
1454
1455 - NULL in case an exception was raised
1456 - Py_True or Py_False for successful comparisons
1457 - Py_NotImplemented in case the type combination is unknown
1458
1459 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1460 case the conversion of the arguments to Unicode fails with a
1461 UnicodeDecodeError.
1462
1463 Possible values for op:
1464
1465 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1466
1467*/
1468
1469PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001470 PyObject *left, /* Left string */
1471 PyObject *right, /* Right string */
1472 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001473 );
1474
Thomas Wouters7e474022000-07-16 12:04:32 +00001475/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001476 the resulting Unicode string. */
1477
Mark Hammond91a681d2002-08-12 07:21:58 +00001478PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001479 PyObject *format, /* Format string */
1480 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001481 );
1482
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001483/* Checks whether element is contained in container and returns 1/0
1484 accordingly.
1485
1486 element has to coerce to a one-element Unicode string. -1 is
1487 returned in case of an error. */
1488
Mark Hammond91a681d2002-08-12 07:21:58 +00001489PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 PyObject *container, /* Container string */
1491 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001492 );
1493
Martin v. Löwis47383402007-08-15 07:32:56 +00001494/* Checks whether argument is a valid identifier. */
1495
1496PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1497
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001498/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001499PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001500 PyUnicodeObject *self,
1501 int striptype,
1502 PyObject *sepobj
1503 );
1504
Eric Smith5807c412008-05-11 21:00:57 +00001505/* Using the current locale, insert the thousands grouping
1506 into the string pointed to by buffer. For the argument descriptions,
1507 see Objects/stringlib/localeutil.h */
1508
Eric Smith0923d1d2009-04-16 20:16:10 +00001509PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1510 Py_ssize_t n_buffer,
1511 Py_UNICODE *digits,
1512 Py_ssize_t n_digits,
1513 Py_ssize_t min_width);
Eric Smith5807c412008-05-11 21:00:57 +00001514
Eric Smitha3b1ac82009-04-03 14:45:06 +00001515/* Using explicit passed-in values, insert the thousands grouping
1516 into the string pointed to by buffer. For the argument descriptions,
1517 see Objects/stringlib/localeutil.h */
Eric Smith0923d1d2009-04-16 20:16:10 +00001518PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1519 Py_ssize_t n_buffer,
1520 Py_UNICODE *digits,
1521 Py_ssize_t n_digits,
1522 Py_ssize_t min_width,
1523 const char *grouping,
1524 const char *thousands_sep);
Guido van Rossumd8225182000-03-10 22:33:05 +00001525/* === Characters Type APIs =============================================== */
1526
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001527/* Helper array used by Py_UNICODE_ISSPACE(). */
1528
1529PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1530
Guido van Rossumd8225182000-03-10 22:33:05 +00001531/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001532 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001533
1534 These APIs are implemented in Objects/unicodectype.c.
1535
1536*/
1537
Mark Hammond91a681d2002-08-12 07:21:58 +00001538PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001539 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001540 );
1541
Mark Hammond91a681d2002-08-12 07:21:58 +00001542PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001544 );
1545
Mark Hammond91a681d2002-08-12 07:21:58 +00001546PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001547 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001548 );
1549
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001550PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001551 Py_UNICODE ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001552 );
1553
1554PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 Py_UNICODE ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001556 );
1557
Mark Hammond91a681d2002-08-12 07:21:58 +00001558PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001559 const Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001560 );
1561
Mark Hammond91a681d2002-08-12 07:21:58 +00001562PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001563 const Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001564 );
1565
Mark Hammond91a681d2002-08-12 07:21:58 +00001566PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001567 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001568 );
1569
Mark Hammond91a681d2002-08-12 07:21:58 +00001570PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001571 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001572 );
1573
Mark Hammond91a681d2002-08-12 07:21:58 +00001574PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001575 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001576 );
1577
Mark Hammond91a681d2002-08-12 07:21:58 +00001578PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001579 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001580 );
1581
Mark Hammond91a681d2002-08-12 07:21:58 +00001582PyAPI_FUNC(int) _PyUnicode_ToDigit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001583 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001584 );
1585
Mark Hammond91a681d2002-08-12 07:21:58 +00001586PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001587 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001588 );
1589
Mark Hammond91a681d2002-08-12 07:21:58 +00001590PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001591 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001592 );
1593
Mark Hammond91a681d2002-08-12 07:21:58 +00001594PyAPI_FUNC(int) _PyUnicode_IsDigit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001595 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001596 );
1597
Mark Hammond91a681d2002-08-12 07:21:58 +00001598PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001599 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001600 );
1601
Georg Brandl559e5d72008-06-11 18:37:52 +00001602PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001603 Py_UNICODE ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001604 );
1605
Mark Hammond91a681d2002-08-12 07:21:58 +00001606PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001607 Py_UNICODE ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001608 );
1609
Martin v. Löwis5b222132007-06-10 09:51:05 +00001610PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1611
1612PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1613 Py_UNICODE *s1, const Py_UNICODE *s2);
1614
1615PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1616 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1617
1618PyAPI_FUNC(int) Py_UNICODE_strcmp(
1619 const Py_UNICODE *s1, const Py_UNICODE *s2);
1620
1621PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1622 const Py_UNICODE *s, Py_UNICODE c
1623 );
1624
Guido van Rossumd8225182000-03-10 22:33:05 +00001625#ifdef __cplusplus
1626}
1627#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001628#endif /* !Py_UNICODEOBJECT_H */