blob: afef5d0ff107a7de1218afd11559dccbc5621bc7 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
Christian Heimesaf98da12008-01-27 15:18:18 +00004#include <stdarg.h>
5
Guido van Rossumd8225182000-03-10 22:33:05 +00006/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal (see file Misc/unicode.txt).
11
Guido van Rossum16b1ad92000-08-03 16:24:25 +000012Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000013
14
15 Original header:
16 --------------------------------------------------------------------
17
18 * Yet another Unicode string type for Python. This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
20 *
21 * Written by Fredrik Lundh, January 1999.
22 *
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
25 *
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
28 *
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000031 *
Guido van Rossumd8225182000-03-10 22:33:05 +000032 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000034 *
Guido van Rossumd8225182000-03-10 22:33:05 +000035 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000038 *
Guido van Rossumd8225182000-03-10 22:33:05 +000039 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000047 *
Guido van Rossumd8225182000-03-10 22:33:05 +000048 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
56
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000057#include <ctype.h>
/* === Internal API ======================================================= */

/* --- Internal Unicode Format -------------------------------------------- */

/* Python 3.x requires unicode, so the feature macro is defined
   unconditionally (it expands to nothing). */
#define Py_USING_UNICODE

Fredrik Lundh9b14ab32001-06-26 22:59:49 +000066/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
67 properly set, but the default rules below doesn't set it. I'll
68 sort this out some other day -- fredrik@pythonware.com */
69
70#ifndef Py_UNICODE_SIZE
71#error Must define Py_UNICODE_SIZE
72#endif
73
/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
   strings are stored as UCS-2 (with limited support for UTF-16).

   Py_UNICODE_WIDE is derived from Py_UNICODE_SIZE: it is defined
   (empty) exactly when Py_UNICODE_SIZE is at least 4. */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif

/* Set these flags if the platform has "wchar.h", "wctype.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* Defaults for various platforms.  These only apply when the build
   configuration has not already chosen PY_UNICODE_TYPE. */
#ifndef PY_UNICODE_TYPE

/* Windows has a usable wchar_t type (unless we're using UCS-4) */
# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
#  define HAVE_USABLE_WCHAR_T
#  define PY_UNICODE_TYPE wchar_t
# endif

/* Wide builds store code units as Py_UCS4. */
# if defined(Py_UNICODE_WIDE)
#  define PY_UNICODE_TYPE Py_UCS4
# endif

#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include <wchar.h>
#endif

/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as a single unsigned integer.
 *
 * The underlying type is the first of unsigned int / unsigned long
 * whose configured size (SIZEOF_INT / SIZEOF_LONG) is at least 4 bytes.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#endif

Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000129/* Py_UNICODE is the native Unicode storage format (code unit) used by
130 Python and represents a single Unicode element in the Unicode
131 type. */
132
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000133typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000134
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */

/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds
   produce different external names and thus cause import errors in
   case Python interpreters and extensions with mixed compiled in
   Unicode width assumptions are combined.

   The two tables below must list the same public names in the same
   order and differ only in the UCS2/UCS4 infix of the mangled name. */

#ifndef Py_UNICODE_WIDE

# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
# define PyUnicode_Compare PyUnicodeUCS2_Compare
/* Mangled suffix kept identical to the UCS-4 table entry. */
# define PyUnicode_CompareWithASCII PyUnicodeUCS2_CompareWithASCII
# define PyUnicode_Concat PyUnicodeUCS2_Concat
# define PyUnicode_Append PyUnicodeUCS2_Append
# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
# define PyUnicode_Contains PyUnicodeUCS2_Contains
# define PyUnicode_Count PyUnicodeUCS2_Count
# define PyUnicode_Decode PyUnicodeUCS2_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS2_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS2_Find
# define PyUnicode_Format PyUnicodeUCS2_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
# define PyUnicode_FromString PyUnicodeUCS2_FromString
# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
# define PyUnicode_Split PyUnicodeUCS2_Split
# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS2_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
# define _PyUnicode_Init _PyUnicodeUCS2_Init

#else

# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
# define PyUnicode_Compare PyUnicodeUCS4_Compare
# define PyUnicode_CompareWithASCII PyUnicodeUCS4_CompareWithASCII
# define PyUnicode_Concat PyUnicodeUCS4_Concat
# define PyUnicode_Append PyUnicodeUCS4_Append
# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
# define PyUnicode_Contains PyUnicodeUCS4_Contains
# define PyUnicode_Count PyUnicodeUCS4_Count
# define PyUnicode_Decode PyUnicodeUCS4_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS4_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS4_Find
# define PyUnicode_Format PyUnicodeUCS4_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
# define PyUnicode_FromString PyUnicodeUCS4_FromString
# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
# define PyUnicode_Split PyUnicodeUCS4_Split
# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS4_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
# define _PyUnicode_Init _PyUnicodeUCS4_Init


#endif

/* --- Internal Unicode Operations ---------------------------------------- */

/* If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-wctype-functions.  This reduces the
   interpreter's code size.

   The wctype.h variants are only selected on wide builds that also
   have a usable wchar_t; every other configuration uses the
   _PyUnicode_* helpers. */

#if defined(Py_UNICODE_WIDE) && defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include <wctype.h>

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#else

/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

   Note that (ch) is evaluated more than once by this macro.
 */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#endif

/* True if ch is alphabetic, a decimal, a digit or numeric.  (ch) may be
   evaluated up to four times, once per classification macro. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))

/* Copy `length` Py_UNICODE code units (not bytes) from source to target
   via Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` code units of `target`.

   Every argument is evaluated exactly once (in the order target, value,
   length), so expressions with side effects are safe to pass. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)

/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and last code units are compared before falling back to
   memcmp() over the whole substring.  Both `string` and `substring`
   are pointers to objects with `str` and `length` members. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))

Barry Warsaw51ac5802000-03-20 16:36:48 +0000397#ifdef __cplusplus
398extern "C" {
399#endif
400
Guido van Rossumd8225182000-03-10 22:33:05 +0000401/* --- Unicode Type ------------------------------------------------------- */
402
403typedef struct {
404 PyObject_HEAD
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000405 Py_ssize_t length; /* Length of raw Unicode data in buffer */
406 Py_UNICODE *str; /* Raw Unicode buffer */
407 long hash; /* Hash value; -1 if not set */
408 int state; /* != 0 if interned. In this case the two
409 * references from the dictionary to this object
410 * are *not* counted in ob_refcnt. */
411 PyObject *defenc; /* (Default) Encoded version as Python
412 string, or NULL; this is used for
413 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000414} PyUnicodeObject;
415
Mark Hammond91a681d2002-08-12 07:21:58 +0000416PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Christian Heimesa22e8bd2007-11-29 22:35:39 +0000417PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000418
/* Interning state constants; SSTATE_NOT_INTERNED is the only zero value. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2

/* PyUnicode_Check tests the Py_TPFLAGS_UNICODE_SUBCLASS flag on op's
   type; PyUnicode_CheckExact is true only when op's type is exactly
   PyUnicode_Type. */
#define PyUnicode_Check(op) \
                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)

/* Fast access macros.

   Each one asserts PyUnicode_Check(op) and then reads the field through
   a cast to PyUnicodeObject *.  GET_SIZE counts code units, GET_DATA_SIZE
   counts bytes; AS_UNICODE and AS_DATA return the same buffer typed as
   Py_UNICODE * and const char * respectively. */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))

/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)

446/* === Public API ========================================================= */
447
448/* --- Plain Py_UNICODE --------------------------------------------------- */
449
450/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000451 size.
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000452
453 u may be NULL which causes the contents to be undefined. It is the
454 user's responsibility to fill in the needed data afterwards. Note
455 that modifying the Unicode object contents after construction is
456 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000457
458 The buffer is copied into the new object. */
459
Mark Hammond91a681d2002-08-12 07:21:58 +0000460PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000461 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000462 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000463 );
464
Georg Brandl952867a2010-06-27 10:17:12 +0000465/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000466PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
467 const char *u, /* char buffer */
468 Py_ssize_t size /* size of buffer */
469 );
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Georg Brandl952867a2010-06-27 10:17:12 +0000472 UTF-8 encoded bytes */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000473PyAPI_FUNC(PyObject*) PyUnicode_FromString(
474 const char *u /* string */
475 );
476
Guido van Rossumd8225182000-03-10 22:33:05 +0000477/* Return a read-only pointer to the Unicode object's internal
478 Py_UNICODE buffer. */
479
Mark Hammond91a681d2002-08-12 07:21:58 +0000480PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000481 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000482 );
483
484/* Get the length of the Unicode object. */
485
Martin v. Löwis18e16552006-02-15 17:27:45 +0000486PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000487 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000488 );
489
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000490/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000491PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000492
Guido van Rossum52c23592000-04-10 13:41:41 +0000493/* Resize an already allocated Unicode object to the new size length.
494
495 *unicode is modified to point to the new (resized) object and 0
496 returned on success.
497
498 This API may only be called by the function which also called the
499 Unicode constructor. The refcount on the object must be 1. Otherwise,
500 an error is returned.
501
502 Error handling is implemented as follows: an exception is set, -1
503 is returned and *unicode left untouched.
504
505*/
506
Mark Hammond91a681d2002-08-12 07:21:58 +0000507PyAPI_FUNC(int) PyUnicode_Resize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508 PyObject **unicode, /* Pointer to the Unicode object */
509 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000510 );
511
Guido van Rossumd8225182000-03-10 22:33:05 +0000512/* Coerce obj to a Unicode object and return a reference with
513 *incremented* refcount.
514
515 Coercion is done in the following way:
516
Georg Brandl952867a2010-06-27 10:17:12 +0000517 1. bytes, bytearray and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000518 using the given encoding and error handling. A NULL encoding selects
519 UTF-8; a NULL errors selects the codec's default (normally "strict").
Guido van Rossumd8225182000-03-10 22:33:05 +0000520
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000521 2. All other objects (including Unicode objects) raise an
522 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000523
524 The API returns NULL in case of an error. The caller is responsible
525 for decref'ing the returned objects.
526
527*/
528
Mark Hammond91a681d2002-08-12 07:21:58 +0000529PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000530 PyObject *obj, /* Object */
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000531 const char *encoding, /* encoding, or NULL for the default */
532 const char *errors /* error handling, or NULL for the default */
533 );
534
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535/* Coerce obj to a Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000536 *incremented* refcount.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000537
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000538 Unicode objects are passed back as-is (subclasses are converted to
539 true Unicode objects). All other objects are delegated to
540 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
Georg Brandl952867a2010-06-27 10:17:12 +0000541 using UTF-8 encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542
543 The API returns NULL in case of an error. The caller is responsible
544 for decref'ing the returned objects.
545
546*/
547
Mark Hammond91a681d2002-08-12 07:21:58 +0000548PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000549 PyObject *obj /* Object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000550 );
551
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
553PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
554
Eric Smith4a7d76d2008-05-30 18:10:19 +0000555/* Format the object based on the format_spec, as defined in PEP 3101
556 (Advanced String Formatting). */
557PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
558 Py_UNICODE *format_spec,
559 Py_ssize_t format_spec_len);
560
Walter Dörwald16807132007-05-25 13:52:07 +0000561PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
562PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
563PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
564PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
565
566/* Evaluates to the state field of op. op is cast to PyUnicodeObject *
   without any type check, so use only if you know it's a string. */
567#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
568
Guido van Rossumd8225182000-03-10 22:33:05 +0000569/* --- wchar_t support for platforms which support it --------------------- */
570
571#ifdef HAVE_WCHAR_H
572
Georg Brandl952867a2010-06-27 10:17:12 +0000573/* Create a Unicode Object from the wchar_t buffer w of the given
Guido van Rossumd8225182000-03-10 22:33:05 +0000574 size.
575
576 The buffer is copied into the new object. */
577
Mark Hammond91a681d2002-08-12 07:21:58 +0000578PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000579 const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000580 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000581 );
582
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000583/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000584 most size wchar_t characters are copied.
585
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000586 Note that the resulting wchar_t string may or may not be
587 0-terminated. It is the responsibility of the caller to make sure
588 that the wchar_t string is 0-terminated in case this is required by
589 the application.
590
591 Returns the number of wchar_t characters copied (excluding a
592 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000593 error. */
594
Martin v. Löwis18e16552006-02-15 17:27:45 +0000595PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000596 PyUnicodeObject *unicode, /* Unicode object */
597 wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000598 Py_ssize_t size /* size of buffer in wchar_t characters */
Guido van Rossumd8225182000-03-10 22:33:05 +0000599 );
600
601#endif
602
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000603/* --- Unicode ordinals --------------------------------------------------- */
604
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000605/* Create a Unicode Object from the given Unicode code point ordinal.
606
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000607 The ordinal must be in range(0x10000) on narrow Python builds
608 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
609 raised in case it is not.
610
611*/
612
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000613PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000614
Benjamin Peterson960cf0f2009-01-09 04:11:44 +0000615/* --- Free-list management ----------------------------------------------- */
616
617/* Clear the free list used by the Unicode implementation.
618
619 This can be used to release memory used for objects on the free
620 list back to the Python memory allocator.
621
622*/
623
624PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
625
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626/* === Builtin Codecs =====================================================
Guido van Rossumd8225182000-03-10 22:33:05 +0000627
628 Many of these APIs take two arguments encoding and errors. These
629 parameters encoding and errors have the same semantics as the ones
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 of the builtin str() API.
Guido van Rossumd8225182000-03-10 22:33:05 +0000631
Georg Brandl952867a2010-06-27 10:17:12 +0000632 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000633
634 Error handling is set by errors which may also be set to NULL
635 meaning to use the default handling defined for the codec. Default
636 error handling for all builtin codecs is "strict" (ValueErrors are
637 raised).
638
639 The codecs all use a similar interface. Only deviations from the
640 generic ones are documented.
641
642*/
643
Fred Drakecb093fe2000-05-09 19:51:53 +0000644/* --- Manage the default encoding ---------------------------------------- */
645
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000646/* Return a Python string holding the default encoded value of the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 Unicode object.
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000648
649 The resulting string is cached in the Unicode object for subsequent
650 usage by this function. The cached version is needed to implement
651 the character buffer interface and will live (at least) as long as
652 the Unicode object itself.
653
654 The refcount of the string is *not* incremented.
655
656 *** Exported for internal use by the interpreter only !!! ***
657
658*/
659
Mark Hammond91a681d2002-08-12 07:21:58 +0000660PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000661 PyObject *unicode,
662 const char *errors);
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000663
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000664/* Returns a pointer to the default encoding (normally, UTF-8) of the
665 Unicode object unicode and the size of the encoded representation
666 in bytes stored in *size.
Christian Heimes5894ba72007-11-04 11:43:14 +0000667
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000668 In case of an error, *size is not set.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000669
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000670 *** This API is for interpreter INTERNAL USE ONLY and will likely
671 *** be removed or changed for Python 3.1.
672
673 *** If you need to access the Unicode object as UTF-8 bytes string,
674 *** please use PyUnicode_AsUTF8String() instead.
675
Martin v. Löwis5b222132007-06-10 09:51:05 +0000676*/
677
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000678PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 PyObject *unicode,
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000680 Py_ssize_t *size);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000681
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000682/* Returns a pointer to the default encoding (normally, UTF-8) of the
683 Unicode object unicode.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000684
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000685 Use of this API is DEPRECATED since no size information can be
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000686 extracted from the returned data.
687
688 *** This API is for interpreter INTERNAL USE ONLY and will likely
689 *** be removed or changed for Python 3.1.
690
691 *** If you need to access the Unicode object as UTF-8 bytes string,
692 *** please use PyUnicode_AsUTF8String() instead.
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000693
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000694*/
Martin v. Löwis5b222132007-06-10 09:51:05 +0000695
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000696PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +0000697
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000698/* Returns the currently active default encoding.
Fred Drakecb093fe2000-05-09 19:51:53 +0000699
Marc-André Lemburg9155aa72008-04-29 11:14:08 +0000700 The default encoding is currently implemented as run-time settable
701 process global. This may change in future versions of the
702 interpreter to become a parameter which is managed on a per-thread
703 basis.
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704
Fred Drakecb093fe2000-05-09 19:51:53 +0000705 */
706
Mark Hammond91a681d2002-08-12 07:21:58 +0000707PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000708
Guido van Rossumd8225182000-03-10 22:33:05 +0000709/* --- Generic Codecs ----------------------------------------------------- */
710
711/* Create a Unicode object by decoding the encoded string s of the
712 given size. */
713
Mark Hammond91a681d2002-08-12 07:21:58 +0000714PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000715 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000716 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000717 const char *encoding, /* encoding */
718 const char *errors /* error handling */
719 );
720
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000721/* Decode a Unicode object unicode and return the result as Python
722 object. */
723
724PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000725 PyObject *unicode, /* Unicode object */
726 const char *encoding, /* encoding */
727 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000728 );
729
730/* Decode a Unicode object unicode and return the result as Unicode
731 object. */
732
733PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000734 PyObject *unicode, /* Unicode object */
735 const char *encoding, /* encoding */
736 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000737 );
738
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739/* Encodes a Py_UNICODE buffer of the given size and returns a
Guido van Rossumd8225182000-03-10 22:33:05 +0000740 Python string object. */
741
Mark Hammond91a681d2002-08-12 07:21:58 +0000742PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000743 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000744 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000745 const char *encoding, /* encoding */
746 const char *errors /* error handling */
747 );
748
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000749/* Encodes a Unicode object and returns the result as Python
750 object. */
751
752PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753 PyObject *unicode, /* Unicode object */
754 const char *encoding, /* encoding */
755 const char *errors /* error handling */
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000756 );
757
Guido van Rossumd8225182000-03-10 22:33:05 +0000758/* Encodes a Unicode object and returns the result as Python string
759 object. */
760
Mark Hammond91a681d2002-08-12 07:21:58 +0000761PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 PyObject *unicode, /* Unicode object */
763 const char *encoding, /* encoding */
764 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000765 );
766
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000767/* Encodes a Unicode object and returns the result as Unicode
768 object. */
769
770PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 PyObject *unicode, /* Unicode object */
772 const char *encoding, /* encoding */
773 const char *errors /* error handling */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000774 );
775
776/* Build an encoding map. */
777
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000778PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
779 PyObject* string /* 256 character map */
780 );
781
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000782/* --- UTF-7 Codecs ------------------------------------------------------- */
783
Mark Hammond91a681d2002-08-12 07:21:58 +0000784PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 const char *string, /* UTF-7 encoded string */
786 Py_ssize_t length, /* size of string */
787 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000788 );
789
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000790PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 const char *string, /* UTF-7 encoded string */
792 Py_ssize_t length, /* size of string */
793 const char *errors, /* error handling */
794 Py_ssize_t *consumed /* bytes consumed */
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000795 );
796
Mark Hammond91a681d2002-08-12 07:21:58 +0000797PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 const Py_UNICODE *data, /* Unicode char buffer */
799 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
800 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
801 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
802 const char *errors /* error handling */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000803 );
804
Guido van Rossumd8225182000-03-10 22:33:05 +0000805/* --- UTF-8 Codecs ------------------------------------------------------- */
806
Mark Hammond91a681d2002-08-12 07:21:58 +0000807PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 const char *string, /* UTF-8 encoded string */
809 Py_ssize_t length, /* size of string */
810 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000811 );
812
Walter Dörwald69652032004-09-07 20:24:22 +0000813PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 const char *string, /* UTF-8 encoded string */
815 Py_ssize_t length, /* size of string */
816 const char *errors, /* error handling */
817 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000818 );
819
Mark Hammond91a681d2002-08-12 07:21:58 +0000820PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000822 );
823
Mark Hammond91a681d2002-08-12 07:21:58 +0000824PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000825 const Py_UNICODE *data, /* Unicode char buffer */
826 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
827 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000828 );
829
Walter Dörwald41980ca2007-08-16 21:55:45 +0000830/* --- UTF-32 Codecs ------------------------------------------------------ */
831
832/* Decodes length bytes from a UTF-32 encoded buffer string and returns
833 the corresponding Unicode object.
834
835 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000836 to "strict".
Walter Dörwald41980ca2007-08-16 21:55:45 +0000837
838 If byteorder is non-NULL, the decoder starts decoding using the
839 given byte order:
840
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841 *byteorder == -1: little endian
842 *byteorder == 0: native order
843 *byteorder == 1: big endian
Walter Dörwald41980ca2007-08-16 21:55:45 +0000844
845 In native mode, the first four bytes of the stream are checked for a
846 BOM mark. If found, the BOM mark is analysed, the byte order
847 adjusted and the BOM skipped. In the other modes, no BOM mark
848 interpretation is done. After completion, *byteorder is set to the
849 current byte order at the end of input data.
850
851 If byteorder is NULL, the codec starts in native order mode.
852
853*/
854
855PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000856 const char *string, /* UTF-32 encoded string */
857 Py_ssize_t length, /* size of string */
858 const char *errors, /* error handling */
859 int *byteorder /* pointer to byteorder to use
860 0=native;-1=LE,1=BE; updated on
861 exit */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000862 );
863
864PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000865 const char *string, /* UTF-32 encoded string */
866 Py_ssize_t length, /* size of string */
867 const char *errors, /* error handling */
868 int *byteorder, /* pointer to byteorder to use
869 0=native;-1=LE,1=BE; updated on
870 exit */
871 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000872 );
873
874/* Returns a Python string using the UTF-32 encoding in native byte
875 order. The string always starts with a BOM mark. */
876
877PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000878 PyObject *unicode /* Unicode object */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000879 );
880
881/* Returns a Python string object holding the UTF-32 encoded value of
882 the Unicode data.
883
884 Output is written according to the following
885 byte order:
886
887 byteorder == -1: little endian
888 byteorder == 0: native byte order (writes a BOM mark)
889 byteorder == 1: big endian
890
891 If byteorder is 0, the output string will always start with the
892 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
893 prepended.
894
895*/
896
897PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 const Py_UNICODE *data, /* Unicode char buffer */
899 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
900 const char *errors, /* error handling */
901 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Walter Dörwald41980ca2007-08-16 21:55:45 +0000902 );
903
Guido van Rossumd8225182000-03-10 22:33:05 +0000904/* --- UTF-16 Codecs ------------------------------------------------------ */
905
Guido van Rossum9e896b32000-04-05 20:11:21 +0000906/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000907 the corresponding Unicode object.
908
909 errors (if non-NULL) defines the error handling. It defaults
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910 to "strict".
Guido van Rossumd8225182000-03-10 22:33:05 +0000911
912 If byteorder is non-NULL, the decoder starts decoding using the
913 given byte order:
914
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 *byteorder == -1: little endian
916 *byteorder == 0: native order
917 *byteorder == 1: big endian
Guido van Rossumd8225182000-03-10 22:33:05 +0000918
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000919 In native mode, the first two bytes of the stream are checked for a
920 BOM mark. If found, the BOM mark is analysed, the byte order
921 adjusted and the BOM skipped. In the other modes, no BOM mark
922 interpretation is done. After completion, *byteorder is set to the
923 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000924
925 If byteorder is NULL, the codec starts in native order mode.
926
927*/
928
Mark Hammond91a681d2002-08-12 07:21:58 +0000929PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000930 const char *string, /* UTF-16 encoded string */
931 Py_ssize_t length, /* size of string */
932 const char *errors, /* error handling */
933 int *byteorder /* pointer to byteorder to use
934 0=native;-1=LE,1=BE; updated on
935 exit */
Guido van Rossumd8225182000-03-10 22:33:05 +0000936 );
937
Walter Dörwald69652032004-09-07 20:24:22 +0000938PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000939 const char *string, /* UTF-16 encoded string */
940 Py_ssize_t length, /* size of string */
941 const char *errors, /* error handling */
942 int *byteorder, /* pointer to byteorder to use
943 0=native;-1=LE,1=BE; updated on
944 exit */
945 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000946 );
947
Guido van Rossumd8225182000-03-10 22:33:05 +0000948/* Returns a Python string using the UTF-16 encoding in native byte
949 order. The string always starts with a BOM mark. */
950
Mark Hammond91a681d2002-08-12 07:21:58 +0000951PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000952 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000953 );
954
955/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000956 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000957
958 Output is written according to the following
959 byte order:
960
961 byteorder == -1: little endian
962 byteorder == 0: native byte order (writes a BOM mark)
963 byteorder == 1: big endian
964
965 If byteorder is 0, the output string will always start with the
966 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
967 prepended.
968
969 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
970 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000971 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000972
973*/
974
Mark Hammond91a681d2002-08-12 07:21:58 +0000975PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000976 const Py_UNICODE *data, /* Unicode char buffer */
977 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
978 const char *errors, /* error handling */
979 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Guido van Rossumd8225182000-03-10 22:33:05 +0000980 );
981
982/* --- Unicode-Escape Codecs ---------------------------------------------- */
983
Mark Hammond91a681d2002-08-12 07:21:58 +0000984PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000985 const char *string, /* Unicode-Escape encoded string */
986 Py_ssize_t length, /* size of string */
987 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +0000988 );
989
Mark Hammond91a681d2002-08-12 07:21:58 +0000990PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +0000992 );
993
Mark Hammond91a681d2002-08-12 07:21:58 +0000994PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000995 const Py_UNICODE *data, /* Unicode char buffer */
996 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000997 );
998
999/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1000
Mark Hammond91a681d2002-08-12 07:21:58 +00001001PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001002 const char *string, /* Raw-Unicode-Escape encoded string */
1003 Py_ssize_t length, /* size of string */
1004 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001005 );
1006
Mark Hammond91a681d2002-08-12 07:21:58 +00001007PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001008 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001009 );
1010
Mark Hammond91a681d2002-08-12 07:21:58 +00001011PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001012 const Py_UNICODE *data, /* Unicode char buffer */
1013 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001014 );
1015
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001016/* --- Unicode Internal Codec ---------------------------------------------
1017
1018 Only for internal use in _codecsmodule.c */
1019
1020PyObject *_PyUnicode_DecodeUnicodeInternal(
1021 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001023 const char *errors
1024 );
1025
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001026/* --- Latin-1 Codecs -----------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001027
1028 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1029
1030*/
1031
Mark Hammond91a681d2002-08-12 07:21:58 +00001032PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001033 const char *string, /* Latin-1 encoded string */
1034 Py_ssize_t length, /* size of string */
1035 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001036 );
1037
Mark Hammond91a681d2002-08-12 07:21:58 +00001038PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001039 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001040 );
1041
Mark Hammond91a681d2002-08-12 07:21:58 +00001042PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 const Py_UNICODE *data, /* Unicode char buffer */
1044 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1045 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001046 );
1047
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048/* --- ASCII Codecs -------------------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001049
1050 Only 7-bit ASCII data is accepted. All other codes generate errors.
1051
1052*/
1053
Mark Hammond91a681d2002-08-12 07:21:58 +00001054PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 const char *string, /* ASCII encoded string */
1056 Py_ssize_t length, /* size of string */
1057 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001058 );
1059
Mark Hammond91a681d2002-08-12 07:21:58 +00001060PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 PyObject *unicode /* Unicode object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001062 );
1063
Mark Hammond91a681d2002-08-12 07:21:58 +00001064PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 const Py_UNICODE *data, /* Unicode char buffer */
1066 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1067 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001068 );
1069
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001070/* --- Character Map Codecs -----------------------------------------------
Guido van Rossumd8225182000-03-10 22:33:05 +00001071
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001072 This codec uses mappings to encode and decode characters.
Guido van Rossumd8225182000-03-10 22:33:05 +00001073
1074 Decoding mappings must map single string characters to single
1075 Unicode characters, integers (which are then interpreted as Unicode
1076 ordinals) or None (meaning "undefined mapping" and causing an
1077 error).
1078
1079 Encoding mappings must map single Unicode characters to single
1080 string characters, integers (which are then interpreted as Latin-1
1081 ordinals) or None (meaning "undefined mapping" and causing an
1082 error).
1083
1084 If a character lookup fails with a LookupError, the character is
1085 copied as-is meaning that its ordinal value will be interpreted as
1086 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1087 to contain those mappings which map characters to different code
1088 points.
1089
1090*/
1091
Mark Hammond91a681d2002-08-12 07:21:58 +00001092PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001093 const char *string, /* Encoded string */
1094 Py_ssize_t length, /* size of string */
1095 PyObject *mapping, /* character mapping
1096 (char ordinal -> unicode ordinal) */
1097 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001098 );
1099
Mark Hammond91a681d2002-08-12 07:21:58 +00001100PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 PyObject *unicode, /* Unicode object */
1102 PyObject *mapping /* character mapping
1103 (unicode ordinal -> char ordinal) */
Guido van Rossumd8225182000-03-10 22:33:05 +00001104 );
1105
Mark Hammond91a681d2002-08-12 07:21:58 +00001106PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 const Py_UNICODE *data, /* Unicode char buffer */
1108 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1109 PyObject *mapping, /* character mapping
1110 (unicode ordinal -> char ordinal) */
1111 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001112 );
1113
1114/* Translate a Py_UNICODE buffer of the given length by applying a
1115 character mapping table to it and return the resulting Unicode
1116 object.
1117
1118 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001120
1121 Mapping tables may be dictionaries or sequences. Unmapped character
1122 ordinals (ones which cause a LookupError) are left untouched and
1123 are copied as-is.
1124
1125*/
1126
Mark Hammond91a681d2002-08-12 07:21:58 +00001127PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 const Py_UNICODE *data, /* Unicode char buffer */
1129 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
1130 PyObject *table, /* Translate table */
1131 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001132 );
1133
Guido van Rossumefec1152000-03-28 02:01:15 +00001134#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +00001135
Guido van Rossumefec1152000-03-28 02:01:15 +00001136/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001137
Mark Hammond91a681d2002-08-12 07:21:58 +00001138PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001139 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001141 const char *errors /* error handling */
1142 );
1143
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001144PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1145 const char *string, /* MBCS encoded string */
1146 Py_ssize_t length, /* size of string */
1147 const char *errors, /* error handling */
1148 Py_ssize_t *consumed /* bytes consumed */
1149 );
1150
Mark Hammond91a681d2002-08-12 07:21:58 +00001151PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001152 PyObject *unicode /* Unicode object */
1153 );
1154
Mark Hammond91a681d2002-08-12 07:21:58 +00001155PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001156 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001157 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001158 const char *errors /* error handling */
1159 );
1160
Guido van Rossumefec1152000-03-28 02:01:15 +00001161#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001162
Guido van Rossum9e896b32000-04-05 20:11:21 +00001163/* --- Decimal Encoder ---------------------------------------------------- */
1164
1165/* Takes a Unicode string holding a decimal value and writes it into
1166 an output buffer using standard ASCII digit codes.
1167
1168 The output buffer has to provide at least length+1 bytes of storage
1169 area. The output string is 0-terminated.
1170
1171 The encoder converts whitespace to ' ', decimal characters to their
1172 corresponding ASCII digit and all other Latin-1 characters except
1173 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1174 are treated as errors. This includes embedded NULL bytes.
1175
1176 Error handling is defined by the errors argument:
1177
1178 NULL or "strict": raise a ValueError
1179 "ignore": ignore the wrong characters (these are not copied to the
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 output buffer)
Guido van Rossum9e896b32000-04-05 20:11:21 +00001181 "replace": replaces illegal characters with '?'
1182
1183 Returns 0 on success, -1 on failure.
1184
1185*/
1186
Mark Hammond91a681d2002-08-12 07:21:58 +00001187PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001188 Py_UNICODE *s, /* Unicode buffer */
1189 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1190 char *output, /* Output buffer; must have size >= length+1 (output is 0-terminated) */
1191 const char *errors /* error handling */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001192 );
1193
Martin v. Löwis011e8422009-05-05 04:43:17 +00001194/* --- File system encoding ---------------------------------------------- */
1195
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001196/* ParseTuple converter: encode str objects to bytes using
1197 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001198
1199PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1200
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001201/* ParseTuple converter: decode bytes objects to unicode using
1202 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1203
1204PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1205
Victor Stinner77c38622010-05-14 15:58:55 +00001206/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1207 and the "surrogateescape" error handler.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001208
Victor Stinner77c38622010-05-14 15:58:55 +00001209 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001210
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001211 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
Martin v. Löwis011e8422009-05-05 04:43:17 +00001212*/
1213
1214PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1215 const char *s /* encoded string */
1216 );
1217
Victor Stinner77c38622010-05-14 15:58:55 +00001218/* Decode a string using Py_FileSystemDefaultEncoding
1219 and the "surrogateescape" error handler.
1220
1221 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1222*/
1223
Martin v. Löwis011e8422009-05-05 04:43:17 +00001224PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1225 const char *s, /* encoded string */
1226 Py_ssize_t size /* size */
1227 );
1228
Victor Stinnerae6265f2010-05-15 16:27:27 +00001229/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Petersonccbd6942010-05-15 17:43:18 +00001230 "surrogateescape" error handler, and return bytes.
Victor Stinnerae6265f2010-05-15 16:27:27 +00001231
1232 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1233*/
1234
1235PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1236 PyObject *unicode
1237 );
1238
Guido van Rossumd8225182000-03-10 22:33:05 +00001239/* --- Methods & Slots ----------------------------------------------------
1240
1241 These are capable of handling Unicode objects and strings on input
1242 (we refer to them as strings in the descriptions) and return
1243 Unicode objects or integers as appropriate. */
1244
1245/* Concat two strings giving a new Unicode string. */
1246
Mark Hammond91a681d2002-08-12 07:21:58 +00001247PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 PyObject *left, /* Left string */
1249 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001250 );
1251
Walter Dörwald1ab83302007-05-18 17:15:44 +00001252/* Concat two strings and put the result in *pleft
1253 (sets *pleft to NULL on error) */
1254
1255PyAPI_FUNC(void) PyUnicode_Append(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001256 PyObject **pleft, /* Pointer to left string */
1257 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001258 );
1259
1260/* Concat two strings, put the result in *pleft and drop the right object
1261 (sets *pleft to NULL on error) */
1262
1263PyAPI_FUNC(void) PyUnicode_AppendAndDel(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 PyObject **pleft, /* Pointer to left string */
1265 PyObject *right /* Right string */
Walter Dörwald1ab83302007-05-18 17:15:44 +00001266 );
1267
Guido van Rossumd8225182000-03-10 22:33:05 +00001268/* Split a string giving a list of Unicode strings.
1269
1270 If sep is NULL, splitting will be done at all whitespace
1271 substrings. Otherwise, splits occur at the given separator.
1272
1273 At most maxsplit splits will be done. If negative, no limit is set.
1274
1275 Separators are not included in the resulting list.
1276
1277*/
1278
Mark Hammond91a681d2002-08-12 07:21:58 +00001279PyAPI_FUNC(PyObject*) PyUnicode_Split(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 PyObject *s, /* String to split */
1281 PyObject *sep, /* String separator */
1282 Py_ssize_t maxsplit /* Maxsplit count */
1283 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001284
1285/* Ditto, but split at line breaks.
1286
1287 CRLF is considered to be one line break. Line breaks are not
1288 included in the resulting list. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001289
Mark Hammond91a681d2002-08-12 07:21:58 +00001290PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291 PyObject *s, /* String to split */
1292 int keepends /* If true, line end markers are included */
1293 );
Guido van Rossumd8225182000-03-10 22:33:05 +00001294
Thomas Wouters477c8d52006-05-27 19:21:47 +00001295/* Partition a string using a given separator. */
1296
1297PyAPI_FUNC(PyObject*) PyUnicode_Partition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 PyObject *s, /* String to partition */
1299 PyObject *sep /* String separator */
1300 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001301
1302/* Partition a string using a given separator, searching from the end of the
1303 string. */
1304
1305PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001306 PyObject *s, /* String to partition */
1307 PyObject *sep /* String separator */
1308 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00001309
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001310/* Split a string giving a list of Unicode strings.
1311
1312 If sep is NULL, splitting will be done at all whitespace
1313 substrings. Otherwise, splits occur at the given separator.
1314
1315 At most maxsplit splits will be done; if maxsplit is negative, no
1316 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1317 the end of the string.
1318
1319 Separators are not included in the resulting list.
1320
1321*/
1322
1323PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001324 PyObject *s, /* String to split */
1325 PyObject *sep, /* String separator */
1326 Py_ssize_t maxsplit /* Maxsplit count */
1327 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001328
Guido van Rossumd8225182000-03-10 22:33:05 +00001329/* Translate a string by applying a character mapping table to it and
1330 return the resulting Unicode object.
1331
1332 The mapping table must map Unicode ordinal integers to Unicode
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 ordinal integers or None (causing deletion of the character).
Guido van Rossumd8225182000-03-10 22:33:05 +00001334
1335 Mapping tables may be dictionaries or sequences. Unmapped character
1336 ordinals (ones which cause a LookupError) are left untouched and
1337 are copied as-is.
1338
1339*/
1340
Mark Hammond91a681d2002-08-12 07:21:58 +00001341PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 PyObject *str, /* String */
1343 PyObject *table, /* Translate table */
1344 const char *errors /* error handling */
Guido van Rossumd8225182000-03-10 22:33:05 +00001345 );
1346
1347/* Join a sequence of strings using the given separator and return
1348 the resulting Unicode string. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001349
Mark Hammond91a681d2002-08-12 07:21:58 +00001350PyAPI_FUNC(PyObject*) PyUnicode_Join(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001351 PyObject *separator, /* Separator string */
1352 PyObject *seq /* Sequence object */
Guido van Rossumd8225182000-03-10 22:33:05 +00001353 );
1354
1355/* Return 1 if substr matches str[start:end] at the given tail end, 0
1356 otherwise. */
1357
Martin v. Löwis18e16552006-02-15 17:27:45 +00001358PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 PyObject *str, /* String */
1360 PyObject *substr, /* Prefix or Suffix string */
1361 Py_ssize_t start, /* Start index */
1362 Py_ssize_t end, /* Stop index */
1363 int direction /* Tail end: -1 prefix, +1 suffix */
Guido van Rossumd8225182000-03-10 22:33:05 +00001364 );
1365
1366/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001367 given search direction or -1 if not found. -2 is returned in case
1368 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001369
Martin v. Löwis18e16552006-02-15 17:27:45 +00001370PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 PyObject *str, /* String */
1372 PyObject *substr, /* Substring to find */
1373 Py_ssize_t start, /* Start index */
1374 Py_ssize_t end, /* Stop index */
1375 int direction /* Find direction: +1 forward, -1 backward */
Guido van Rossumd8225182000-03-10 22:33:05 +00001376 );
1377
Barry Warsaw51ac5802000-03-20 16:36:48 +00001378/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001379
Martin v. Löwis18e16552006-02-15 17:27:45 +00001380PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 PyObject *str, /* String */
1382 PyObject *substr, /* Substring to count */
1383 Py_ssize_t start, /* Start index */
1384 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001385 );
1386
Barry Warsaw51ac5802000-03-20 16:36:48 +00001387/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001388 and return the resulting Unicode object. */
1389
Mark Hammond91a681d2002-08-12 07:21:58 +00001390PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 PyObject *str, /* String */
1392 PyObject *substr, /* Substring to find */
1393 PyObject *replstr, /* Substring to replace */
1394 Py_ssize_t maxcount /* Max. number of replacements to apply;
1395 -1 = all */
Guido van Rossumd8225182000-03-10 22:33:05 +00001396 );
1397
1398/* Compare two strings and return -1, 0, 1 for less than, equal,
1399 greater than resp. */
1400
Mark Hammond91a681d2002-08-12 07:21:58 +00001401PyAPI_FUNC(int) PyUnicode_Compare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001402 PyObject *left, /* Left string */
1403 PyObject *right /* Right string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001404 );
1405
Martin v. Löwis5b222132007-06-10 09:51:05 +00001406PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1407 PyObject *left,
1408 const char *right
1409 );
1410
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001411/* Rich compare two strings and return one of the following:
1412
1413 - NULL in case an exception was raised
1414 - Py_True or Py_False for successful comparisons
1415 - Py_NotImplemented in case the type combination is unknown
1416
1417 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1418 case the conversion of the arguments to Unicode fails with a
1419 UnicodeDecodeError.
1420
1421 Possible values for op:
1422
1423 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1424
1425*/
1426
1427PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001428 PyObject *left, /* Left string */
1429 PyObject *right, /* Right string */
1430 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001431 );
1432
Thomas Wouters7e474022000-07-16 12:04:32 +00001433/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001434 the resulting Unicode string. */
1435
Mark Hammond91a681d2002-08-12 07:21:58 +00001436PyAPI_FUNC(PyObject *) PyUnicode_Format(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 PyObject *format, /* Format string */
1438 PyObject *args /* Argument tuple or dictionary */
Guido van Rossumd8225182000-03-10 22:33:05 +00001439 );
1440
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001441/* Checks whether element is contained in container and return 1/0
1442 accordingly.
1443
1444 element has to coerce to a one-element Unicode string. -1 is
1445 returned in case of an error. */
1446
Mark Hammond91a681d2002-08-12 07:21:58 +00001447PyAPI_FUNC(int) PyUnicode_Contains(
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001448 PyObject *container, /* Container string */
1449 PyObject *element /* Element string */
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001450 );
1451
Martin v. Löwis47383402007-08-15 07:32:56 +00001452/* Checks whether argument is a valid identifier. */
1453
1454PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1455
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001456/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001457PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001458 PyUnicodeObject *self,
1459 int striptype,
1460 PyObject *sepobj
1461 );
1462
Eric Smith5807c412008-05-11 21:00:57 +00001463/* Using the current locale, insert the thousands grouping
1464 into the string pointed to by buffer. For the argument descriptions,
1465 see Objects/stringlib/localeutil.h */
1466
Eric Smith0923d1d2009-04-16 20:16:10 +00001467PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1468 Py_ssize_t n_buffer,
1469 Py_UNICODE *digits,
1470 Py_ssize_t n_digits,
1471 Py_ssize_t min_width);
Eric Smith5807c412008-05-11 21:00:57 +00001472
Eric Smitha3b1ac82009-04-03 14:45:06 +00001473/* Using explicit passed-in values, insert the thousands grouping
1474 into the string pointed to by buffer. For the argument descriptions,
1475 see Objects/stringlib/localeutil.h */
Eric Smith0923d1d2009-04-16 20:16:10 +00001476PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1477 Py_ssize_t n_buffer,
1478 Py_UNICODE *digits,
1479 Py_ssize_t n_digits,
1480 Py_ssize_t min_width,
1481 const char *grouping,
1482 const char *thousands_sep);
Guido van Rossumd8225182000-03-10 22:33:05 +00001483/* === Characters Type APIs =============================================== */
1484
Benjamin Peterson960cf0f2009-01-09 04:11:44 +00001485/* Helper array used by Py_UNICODE_ISSPACE(). */
1486
1487PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1488
Guido van Rossumd8225182000-03-10 22:33:05 +00001489/* These should not be used directly. Use the Py_UNICODE_IS* and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001490 Py_UNICODE_TO* macros instead.
Guido van Rossumd8225182000-03-10 22:33:05 +00001491
1492 These APIs are implemented in Objects/unicodectype.c.
1493
1494*/
1495
Mark Hammond91a681d2002-08-12 07:21:58 +00001496PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001497 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001498 );
1499
Mark Hammond91a681d2002-08-12 07:21:58 +00001500PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001501 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001502 );
1503
Mark Hammond91a681d2002-08-12 07:21:58 +00001504PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001505 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001506 );
1507
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001508PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001509 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001510 );
1511
1512PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001513 Py_UCS4 ch /* Unicode character */
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001514 );
1515
Mark Hammond91a681d2002-08-12 07:21:58 +00001516PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001517 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001518 );
1519
Mark Hammond91a681d2002-08-12 07:21:58 +00001520PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001521 const Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001522 );
1523
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001524PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1525 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001526 );
1527
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001528PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1529 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001530 );
1531
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001532PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1533 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001534 );
1535
Mark Hammond91a681d2002-08-12 07:21:58 +00001536PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001537 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001538 );
1539
Mark Hammond91a681d2002-08-12 07:21:58 +00001540PyAPI_FUNC(int) _PyUnicode_ToDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001541 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001542 );
1543
Mark Hammond91a681d2002-08-12 07:21:58 +00001544PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001545 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001546 );
1547
Mark Hammond91a681d2002-08-12 07:21:58 +00001548PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001549 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001550 );
1551
Mark Hammond91a681d2002-08-12 07:21:58 +00001552PyAPI_FUNC(int) _PyUnicode_IsDigit(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001553 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001554 );
1555
Mark Hammond91a681d2002-08-12 07:21:58 +00001556PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001557 Py_UCS4 ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001558 );
1559
Georg Brandl559e5d72008-06-11 18:37:52 +00001560PyAPI_FUNC(int) _PyUnicode_IsPrintable(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001561 Py_UCS4 ch /* Unicode character */
Georg Brandl559e5d72008-06-11 18:37:52 +00001562 );
1563
Mark Hammond91a681d2002-08-12 07:21:58 +00001564PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +00001565 Py_UCS4 ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001566 );
1567
Victor Stinneref8d95c2010-08-16 22:03:11 +00001568PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1569 const Py_UNICODE *u
1570 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001571
1572PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001573 Py_UNICODE *s1,
1574 const Py_UNICODE *s2);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001575
Victor Stinnerc4eb7652010-09-01 23:43:50 +00001576PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1577 Py_UNICODE *s1, const Py_UNICODE *s2);
1578
Martin v. Löwis5b222132007-06-10 09:51:05 +00001579PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001580 Py_UNICODE *s1,
1581 const Py_UNICODE *s2,
1582 size_t n);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001583
1584PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001585 const Py_UNICODE *s1,
1586 const Py_UNICODE *s2
1587 );
1588
1589PyAPI_FUNC(int) Py_UNICODE_strncmp(
1590 const Py_UNICODE *s1,
1591 const Py_UNICODE *s2,
1592 size_t n
1593 );
Martin v. Löwis5b222132007-06-10 09:51:05 +00001594
1595PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001596 const Py_UNICODE *s,
1597 Py_UNICODE c
Martin v. Löwis5b222132007-06-10 09:51:05 +00001598 );
1599
Victor Stinner331ea922010-08-10 16:37:20 +00001600PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinneref8d95c2010-08-16 22:03:11 +00001601 const Py_UNICODE *s,
1602 Py_UNICODE c
Victor Stinner331ea922010-08-10 16:37:20 +00001603 );
1604
Guido van Rossumd8225182000-03-10 22:33:05 +00001605#ifdef __cplusplus
1606}
1607#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001608#endif /* !Py_UNICODEOBJECT_H */