blob: 203dcef09fd9abf87d6ebd0cc562f9586920ba76 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000055#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000056
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
   properly set, but the default rules below don't set it.  I'll
   sort this out some other day -- fredrik@pythonware.com */
64
65#ifndef Py_UNICODE_SIZE
66#error Must define Py_UNICODE_SIZE
67#endif
68
Fredrik Lundh8f455852001-06-27 18:59:43 +000069/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
70 strings are stored as UCS-2 (with limited support for UTF-16) */
71
72#if Py_UNICODE_SIZE >= 4
73#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000074#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000075
Guido van Rossumd8225182000-03-10 22:33:05 +000076/* Set these flags if the platform has "wchar.h", "wctype.h" and the
77 wchar_t type is a 16-bit unsigned type */
78/* #define HAVE_WCHAR_H */
79/* #define HAVE_USABLE_WCHAR_T */
80
81/* Defaults for various platforms */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000082#ifndef PY_UNICODE_TYPE
Guido van Rossumd8225182000-03-10 22:33:05 +000083
Fredrik Lundh1294ad02001-06-26 17:17:07 +000084/* Windows has a usable wchar_t type (unless we're using UCS-4) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000085# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
Guido van Rossumd8225182000-03-10 22:33:05 +000086# define HAVE_USABLE_WCHAR_T
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000087# define PY_UNICODE_TYPE wchar_t
88# endif
89
Fredrik Lundh8f455852001-06-27 18:59:43 +000090# if defined(Py_UNICODE_WIDE)
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000091# define PY_UNICODE_TYPE Py_UCS4
Guido van Rossumd8225182000-03-10 22:33:05 +000092# endif
93
94#endif
95
96/* If the compiler provides a wchar_t type we try to support it
97 through the interface functions PyUnicode_FromWideChar() and
98 PyUnicode_AsWideChar(). */
99
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
106#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
108# ifdef _HAVE_BSDI
109# include <time.h>
110# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000111# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000112#endif
113
/*
 * Py_UCS4: use this typedef when you need to represent a UTF-16 surrogate
 * pair as a single unsigned integer.
 *
 * `unsigned int` is chosen when SIZEOF_INT reports at least 4 bytes,
 * otherwise `unsigned long` when SIZEOF_LONG does.  If neither condition
 * holds, Py_UCS4 is left undefined.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#endif
123
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000124typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000125
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
127
/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
   produce different external names and thus cause import errors in
   case a Python interpreter and extensions compiled with different
   Unicode width assumptions are combined. */
132
133#ifndef Py_UNICODE_WIDE
134
135# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
136# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000137# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000138# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
139# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
140# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
Walter Dörwald41980ca2007-08-16 21:55:45 +0000141# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000142# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
143# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
144# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
145# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
146# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
147# define PyUnicode_Compare PyUnicodeUCS2_Compare
148# define PyUnicode_Concat PyUnicodeUCS2_Concat
Walter Dörwald1ab83302007-05-18 17:15:44 +0000149# define PyUnicode_Append PyUnicodeUCS2_Append
150# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000151# define PyUnicode_Contains PyUnicodeUCS2_Contains
152# define PyUnicode_Count PyUnicodeUCS2_Count
153# define PyUnicode_Decode PyUnicodeUCS2_Decode
154# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
155# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
156# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000157# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
Christian Heimes5894ba72007-11-04 11:43:14 +0000158# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000159# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000160# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
161# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000162# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
Walter Dörwald69652032004-09-07 20:24:22 +0000163# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000164# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
Walter Dörwald69652032004-09-07 20:24:22 +0000165# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000166# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
167# define PyUnicode_Encode PyUnicodeUCS2_Encode
168# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
169# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
170# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
171# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
172# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000173# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000174# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
175# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
176# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
177# define PyUnicode_Find PyUnicodeUCS2_Find
178# define PyUnicode_Format PyUnicodeUCS2_Format
179# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
180# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000181# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000182# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000183# define PyUnicode_FromString PyUnicodeUCS2_FromString
Walter Dörwaldd2034312007-05-18 16:29:38 +0000184# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
185# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
186# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
Walter Dörwald14176a52007-05-18 17:04:42 +0000187# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000188# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
189# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
190# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
Martin v. Löwis47383402007-08-15 07:32:56 +0000191# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000192# define PyUnicode_Join PyUnicodeUCS2_Join
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193# define PyUnicode_Partition PyUnicodeUCS2_Partition
194# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
195# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000196# define PyUnicode_Replace PyUnicodeUCS2_Replace
197# define PyUnicode_Resize PyUnicodeUCS2_Resize
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000198# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000199# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
200# define PyUnicode_Split PyUnicodeUCS2_Split
201# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
202# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
203# define PyUnicode_Translate PyUnicodeUCS2_Translate
204# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
205# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
206# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
207# define _PyUnicode_Init _PyUnicodeUCS2_Init
208# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
209# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
210# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
211# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
212# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
213# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
214# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000215# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
216# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000217# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
218# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
219# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
220# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
221# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
222# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
223# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
224# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
225
226#else
227
228# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
229# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000230# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000231# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
232# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
233# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
Walter Dörwald41980ca2007-08-16 21:55:45 +0000234# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000235# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
236# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
237# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
238# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
239# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
240# define PyUnicode_Compare PyUnicodeUCS4_Compare
241# define PyUnicode_Concat PyUnicodeUCS4_Concat
Walter Dörwald1ab83302007-05-18 17:15:44 +0000242# define PyUnicode_Append PyUnicodeUCS4_Append
243# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000244# define PyUnicode_Contains PyUnicodeUCS4_Contains
245# define PyUnicode_Count PyUnicodeUCS4_Count
246# define PyUnicode_Decode PyUnicodeUCS4_Decode
247# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
248# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
249# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000250# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
Christian Heimes5894ba72007-11-04 11:43:14 +0000251# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000252# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000253# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
254# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000255# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
Walter Dörwald69652032004-09-07 20:24:22 +0000256# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000257# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
Walter Dörwald69652032004-09-07 20:24:22 +0000258# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000259# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
260# define PyUnicode_Encode PyUnicodeUCS4_Encode
261# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
262# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
263# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
264# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
265# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000266# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000267# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
268# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
269# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
270# define PyUnicode_Find PyUnicodeUCS4_Find
271# define PyUnicode_Format PyUnicodeUCS4_Format
272# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
273# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000274# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000275# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000276# define PyUnicode_FromString PyUnicodeUCS4_FromString
Walter Dörwaldd2034312007-05-18 16:29:38 +0000277# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
278# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
279# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000280# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
281# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
282# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
283# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
Martin v. Löwis47383402007-08-15 07:32:56 +0000284# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000285# define PyUnicode_Join PyUnicodeUCS4_Join
Thomas Wouters477c8d52006-05-27 19:21:47 +0000286# define PyUnicode_Partition PyUnicodeUCS4_Partition
287# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
288# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000289# define PyUnicode_Replace PyUnicodeUCS4_Replace
290# define PyUnicode_Resize PyUnicodeUCS4_Resize
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000291# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000292# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
293# define PyUnicode_Split PyUnicodeUCS4_Split
294# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
295# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
296# define PyUnicode_Translate PyUnicodeUCS4_Translate
297# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
298# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
299# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
300# define _PyUnicode_Init _PyUnicodeUCS4_Init
301# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
302# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
303# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
304# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
305# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
306# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
307# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000308# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
309# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000310# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
311# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
312# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
313# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
314# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
315# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
316# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
317# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
318
319
320#endif
321
Guido van Rossumd8225182000-03-10 22:33:05 +0000322/* --- Internal Unicode Operations ---------------------------------------- */
323
324/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000325 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
Raymond Hettinger57341c32004-10-31 05:46:59 +0000326 configure Python using --with-wctype-functions. This reduces the
Barry Warsaw51ac5802000-03-20 16:36:48 +0000327 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000328
329#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
330
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000331#include <wctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000332
333#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
334
335#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
336#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
337#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
338#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
339
340#define Py_UNICODE_TOLOWER(ch) towlower(ch)
341#define Py_UNICODE_TOUPPER(ch) towupper(ch)
342#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
343
344#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
345#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
346#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
347
348#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
349#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
350#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
351
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000352#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
353
Guido van Rossumd8225182000-03-10 22:33:05 +0000354#else
355
356#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
357
358#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
359#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
360#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
361#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
362
363#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
364#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
365#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
366
367#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
368#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
369#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
370
371#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
372#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
373#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
374
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000375#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000376
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000377#endif
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000378
/* True if any of Py_UNICODE_ISALPHA, Py_UNICODE_ISDECIMAL,
   Py_UNICODE_ISDIGIT or Py_UNICODE_ISNUMERIC is true for ch.
   Note: ch may be evaluated up to four times, so it must not have
   side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
384
/* Copy length Py_UNICODE elements from source to target via Py_MEMCPY.
   length is a count of elements, not bytes. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
Guido van Rossumd8225182000-03-10 22:33:05 +0000387
/* Store value into the first length elements of target.

   Every argument is evaluated exactly once (target, then value, then
   length), so expressions with side effects are safe.  Nothing is
   stored when length is zero or negative. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     Py_ssize_t n_ = (length);\
     for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000392
/* Check if substring matches string at the given offset.  The offset must
   be valid, and the substring must not be empty.

   The first and the last element are compared first; memcmp() over the
   whole substring is only reached when both of them agree.  The arguments
   are evaluated several times, so they must not have side effects. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
Guido van Rossumd8225182000-03-10 22:33:05 +0000399
Barry Warsaw51ac5802000-03-20 16:36:48 +0000400#ifdef __cplusplus
401extern "C" {
402#endif
403
Guido van Rossumd8225182000-03-10 22:33:05 +0000404/* --- Unicode Type ------------------------------------------------------- */
405
406typedef struct {
407 PyObject_HEAD
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t length; /* Length of raw Unicode data in buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000409 Py_UNICODE *str; /* Raw Unicode buffer */
410 long hash; /* Hash value; -1 if not set */
Walter Dörwald16807132007-05-25 13:52:07 +0000411 int state; /* != 0 if interned. In this case the two
412 * references from the dictionary to this object
413 * are *not* counted in ob_refcnt. */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000414 PyObject *defenc; /* (Default) Encoded version as Python
415 string, or NULL; this is used for
416 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000417} PyUnicodeObject;
418
Mark Hammond91a681d2002-08-12 07:21:58 +0000419PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000420
/* Interning states.  NOTE(review): presumably the values stored in
   PyUnicodeObject.state (0 meaning "not interned") -- confirm against
   the implementation. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
424
/* Type checks.  PyUnicode_Check() tests the Py_TPFLAGS_UNICODE_SUBCLASS
   flag on the object's type; PyUnicode_CheckExact() is true only when the
   object's type is exactly PyUnicode_Type. */
#define PyUnicode_Check(op) \
    PyType_FastSubclass(Py_Type(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_Type(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000428
/* Fast access macros.

   Each macro asserts PyUnicode_Check(op) and then reads the
   PyUnicodeObject fields directly.  op appears more than once in each
   expansion, so it must not have side effects.

   PyUnicode_GET_SIZE      -- the length field
   PyUnicode_GET_DATA_SIZE -- the length field times sizeof(Py_UNICODE)
   PyUnicode_AS_UNICODE    -- the str buffer as Py_UNICODE *
   PyUnicode_AS_DATA       -- the str buffer as const char * */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
Guido van Rossumd8225182000-03-10 22:33:05 +0000438
439/* --- Constants ---------------------------------------------------------- */
440
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
447
448/* === Public API ========================================================= */
449
450/* --- Plain Py_UNICODE --------------------------------------------------- */
451
452/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453 size.
454
455 u may be NULL which causes the contents to be undefined. It is the
456 user's responsibility to fill in the needed data afterwards. Note
457 that modifying the Unicode object contents after construction is
458 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000459
460 The buffer is copied into the new object. */
461
Mark Hammond91a681d2002-08-12 07:21:58 +0000462PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000463 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000464 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000465 );
466
Walter Dörwaldd2034312007-05-18 16:29:38 +0000467/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
468PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
469 const char *u, /* char buffer */
470 Py_ssize_t size /* size of buffer */
471 );
472
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000473/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
474 Latin-1 encoded bytes */
475PyAPI_FUNC(PyObject*) PyUnicode_FromString(
476 const char *u /* string */
477 );
478
Guido van Rossumd8225182000-03-10 22:33:05 +0000479/* Return a read-only pointer to the Unicode object's internal
480 Py_UNICODE buffer. */
481
Mark Hammond91a681d2002-08-12 07:21:58 +0000482PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000483 PyObject *unicode /* Unicode object */
484 );
485
486/* Get the length of the Unicode object. */
487
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Guido van Rossumd8225182000-03-10 22:33:05 +0000489 PyObject *unicode /* Unicode object */
490 );
491
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000492/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000493PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000494
Guido van Rossum52c23592000-04-10 13:41:41 +0000495/* Resize an already allocated Unicode object to the new size length.
496
497 *unicode is modified to point to the new (resized) object and 0
498 returned on success.
499
500 This API may only be called by the function which also called the
501 Unicode constructor. The refcount on the object must be 1. Otherwise,
502 an error is returned.
503
504 Error handling is implemented as follows: an exception is set, -1
505 is returned and *unicode left untouched.
506
507*/
508
Mark Hammond91a681d2002-08-12 07:21:58 +0000509PyAPI_FUNC(int) PyUnicode_Resize(
Guido van Rossum52c23592000-04-10 13:41:41 +0000510 PyObject **unicode, /* Pointer to the Unicode object */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000512 );
513
Guido van Rossumd8225182000-03-10 22:33:05 +0000514/* Coerce obj to an Unicode object and return a reference with
515 *incremented* refcount.
516
517 Coercion is done in the following way:
518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519 1. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000520 under the assumptions that they contain data using the current
521 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000522
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000523 2. All other objects (including Unicode objects) raise an
524 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000525
526 The API returns NULL in case of an error. The caller is responsible
527 for decref'ing the returned objects.
528
529*/
530
Mark Hammond91a681d2002-08-12 07:21:58 +0000531PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000532 register PyObject *obj, /* Object */
533 const char *encoding, /* encoding */
534 const char *errors /* error handling */
535 );
536
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000538 *incremented* refcount.
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539
540 Unicode objects are passed back as-is (subclasses are converted to
541 true Unicode objects), all other objects are delegated to
542 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
543 using the default encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000544
545 The API returns NULL in case of an error. The caller is responsible
546 for decref'ing the returned objects.
547
548*/
549
Mark Hammond91a681d2002-08-12 07:21:58 +0000550PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Guido van Rossumd8225182000-03-10 22:33:05 +0000551 register PyObject *obj /* Object */
552 );
553
Walter Dörwaldd2034312007-05-18 16:29:38 +0000554PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
555PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
556
Walter Dörwald16807132007-05-25 13:52:07 +0000557PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
558PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
559PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
560PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
561
562/* Use only if you know it's a string */
563#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
564
Guido van Rossumd8225182000-03-10 22:33:05 +0000565/* --- wchar_t support for platforms which support it --------------------- */
566
567#ifdef HAVE_WCHAR_H
568
569/* Create a Unicode Object from the wchar_t buffer w of the given
570 size.
571
572 The buffer is copied into the new object. */
573
Mark Hammond91a681d2002-08-12 07:21:58 +0000574PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000575 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000577 );
578
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000579/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000580 most size wchar_t characters are copied.
581
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000582 Note that the resulting wchar_t string may or may not be
583 0-terminated. It is the responsibility of the caller to make sure
584 that the wchar_t string is 0-terminated in case this is required by
585 the application.
586
587 Returns the number of wchar_t characters copied (excluding a
588 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000589 error. */
590
Martin v. Löwis18e16552006-02-15 17:27:45 +0000591PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000592 PyUnicodeObject *unicode, /* Unicode object */
593 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000594 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000595 );
596
597#endif
598
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000599/* --- Unicode ordinals --------------------------------------------------- */
600
601/* Create a Unicode Object from the given Unicode code point ordinal.
602
603 The ordinal must be in range(0x10000) on narrow Python builds
604 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
605 raised in case it is not.
606
607*/
608
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000609PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000610
Guido van Rossumd8225182000-03-10 22:33:05 +0000611/* === Builtin Codecs =====================================================
612
613 Many of these APIs take two arguments encoding and errors. These
614 parameters encoding and errors have the same semantics as the ones
615 of the builtin unicode() API.
616
Fred Drakecb093fe2000-05-09 19:51:53 +0000617 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000618
619 Error handling is set by errors which may also be set to NULL
620 meaning to use the default handling defined for the codec. Default
621 error handling for all builtin codecs is "strict" (ValueErrors are
622 raised).
623
624 The codecs all use a similar interface. Only deviations from the
625 generic ones are documented.
626
627*/
628
Fred Drakecb093fe2000-05-09 19:51:53 +0000629/* --- Manage the default encoding ---------------------------------------- */
630
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000631/* Return a Python string holding the default encoded value of the
632 Unicode object.
633
634 The resulting string is cached in the Unicode object for subsequent
635 usage by this function. The cached version is needed to implement
636 the character buffer interface and will live (at least) as long as
637 the Unicode object itself.
638
639 The refcount of the string is *not* incremented.
640
641 *** Exported for internal use by the interpreter only !!! ***
642
643*/
644
Mark Hammond91a681d2002-08-12 07:21:58 +0000645PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000646 PyObject *, const char *);
647
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000648/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
649
650 If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
651 UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise fall back
652 to UTF-8 and replace invalid characters with '?'.
653
654 The function is intended to be used for paths and file names only
655 during bootstrapping process where the codecs are not set up.
656*/
657
658PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
659 const char *s /* encoded string */
660 );
661
Christian Heimes5894ba72007-11-04 11:43:14 +0000662PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
663 const char *s, /* encoded string */
664 Py_ssize_t size /* size */
665 );
666
667
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000668/* Return a char* holding the UTF-8 encoded value of the
669 Unicode object.
670
671 DEPRECATED: use PyUnicode_AsStringAndSize() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000672*/
673
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000674PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *);
675
676/* PyUnicode_AsStringAndSize(), declared above: returns the UTF-8 encoding, and its size.
677
678 If the output argument is NULL, no size is stored.
679 */
680
Martin v. Löwis5b222132007-06-10 09:51:05 +0000681PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*);
682
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000683/* PyUnicode_AsString(), declared above: returns the UTF-8 encoding.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000684
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000685 This is equivalent to PyUnicode_AsStringAndSize(x, NULL).
Fred Drakecb093fe2000-05-09 19:51:53 +0000686
Fred Drakecb093fe2000-05-09 19:51:53 +0000687 */
688
Mark Hammond91a681d2002-08-12 07:21:58 +0000689PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000690
691/* Sets the currently active default encoding.
692
693 Returns 0 on success, -1 in case of an error.
694
695 */
696
Mark Hammond91a681d2002-08-12 07:21:58 +0000697PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
Fred Drakecb093fe2000-05-09 19:51:53 +0000698 const char *encoding /* Encoding name in standard form */
699 );
700
Guido van Rossumd8225182000-03-10 22:33:05 +0000701/* --- Generic Codecs ----------------------------------------------------- */
702
703/* Create a Unicode object by decoding the encoded string s of the
704 given size. */
705
Mark Hammond91a681d2002-08-12 07:21:58 +0000706PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000707 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000708 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000709 const char *encoding, /* encoding */
710 const char *errors /* error handling */
711 );
712
713/* Encodes a Py_UNICODE buffer of the given size and returns a
714 Python string object. */
715
Mark Hammond91a681d2002-08-12 07:21:58 +0000716PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000717 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000718 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000719 const char *encoding, /* encoding */
720 const char *errors /* error handling */
721 );
722
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000723/* Encodes a Unicode object and returns the result as Python
724 object. */
725
726PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
727 PyObject *unicode, /* Unicode object */
728 const char *encoding, /* encoding */
729 const char *errors /* error handling */
730 );
731
Guido van Rossumd8225182000-03-10 22:33:05 +0000732/* Encodes a Unicode object and returns the result as Python string
733 object. */
734
Mark Hammond91a681d2002-08-12 07:21:58 +0000735PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000736 PyObject *unicode, /* Unicode object */
737 const char *encoding, /* encoding */
738 const char *errors /* error handling */
739 );
740
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000741PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
742 PyObject* string /* 256 character map */
743 );
744
745
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000746/* --- UTF-7 Codecs ------------------------------------------------------- */
747
Mark Hammond91a681d2002-08-12 07:21:58 +0000748PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000749 const char *string, /* UTF-7 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000750 Py_ssize_t length, /* size of string */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000751 const char *errors /* error handling */
752 );
753
Mark Hammond91a681d2002-08-12 07:21:58 +0000754PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000755 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000756 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000757 int encodeSetO, /* force the encoder to encode characters in
758 Set O, as described in RFC2152 */
759 int encodeWhiteSpace, /* force the encoder to encode space, tab,
760 carriage return and linefeed characters */
761 const char *errors /* error handling */
762 );
763
Guido van Rossumd8225182000-03-10 22:33:05 +0000764/* --- UTF-8 Codecs ------------------------------------------------------- */
765
Mark Hammond91a681d2002-08-12 07:21:58 +0000766PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Guido van Rossumd8225182000-03-10 22:33:05 +0000767 const char *string, /* UTF-8 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000768 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000769 const char *errors /* error handling */
770 );
771
Walter Dörwald69652032004-09-07 20:24:22 +0000772PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
773 const char *string, /* UTF-8 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000774 Py_ssize_t length, /* size of string */
Walter Dörwald69652032004-09-07 20:24:22 +0000775 const char *errors, /* error handling */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000776 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000777 );
778
Mark Hammond91a681d2002-08-12 07:21:58 +0000779PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000780 PyObject *unicode /* Unicode object */
781 );
782
Mark Hammond91a681d2002-08-12 07:21:58 +0000783PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Guido van Rossumd8225182000-03-10 22:33:05 +0000784 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000785 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000786 const char *errors /* error handling */
787 );
788
Walter Dörwald41980ca2007-08-16 21:55:45 +0000789/* --- UTF-32 Codecs ------------------------------------------------------ */
790
791/* Decodes length bytes from a UTF-32 encoded buffer string and returns
792 the corresponding Unicode object.
793
794 errors (if non-NULL) defines the error handling. It defaults
795 to "strict".
796
797 If byteorder is non-NULL, the decoder starts decoding using the
798 given byte order:
799
800 *byteorder == -1: little endian
801 *byteorder == 0: native order
802 *byteorder == 1: big endian
803
804 In native mode, the first four bytes of the stream are checked for a
805 BOM mark. If found, the BOM mark is analysed, the byte order
806 adjusted and the BOM skipped. In the other modes, no BOM mark
807 interpretation is done. After completion, *byteorder is set to the
808 current byte order at the end of input data.
809
810 If byteorder is NULL, the codec starts in native order mode.
811
812*/
813
814PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
815 const char *string, /* UTF-32 encoded string */
816 Py_ssize_t length, /* size of string */
817 const char *errors, /* error handling */
818 int *byteorder /* pointer to byteorder to use
819 0=native;-1=LE,1=BE; updated on
820 exit */
821 );
822
823PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
824 const char *string, /* UTF-32 encoded string */
825 Py_ssize_t length, /* size of string */
826 const char *errors, /* error handling */
827 int *byteorder, /* pointer to byteorder to use
828 0=native;-1=LE,1=BE; updated on
829 exit */
830 Py_ssize_t *consumed /* bytes consumed */
831 );
832
833/* Returns a Python string using the UTF-32 encoding in native byte
834 order. The string always starts with a BOM mark. */
835
836PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
837 PyObject *unicode /* Unicode object */
838 );
839
840/* Returns a Python string object holding the UTF-32 encoded value of
841 the Unicode data.
842
843 If byteorder is not 0, output is written according to the following
844 byte order:
845
846 byteorder == -1: little endian
847 byteorder == 0: native byte order (writes a BOM mark)
848 byteorder == 1: big endian
849
850 If byteorder is 0, the output string will always start with the
851 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
852 prepended.
853
854*/
855
856PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
857 const Py_UNICODE *data, /* Unicode char buffer */
858 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
859 const char *errors, /* error handling */
860 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
861 );
862
Guido van Rossumd8225182000-03-10 22:33:05 +0000863/* --- UTF-16 Codecs ------------------------------------------------------ */
864
Guido van Rossum9e896b32000-04-05 20:11:21 +0000865/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000866 the corresponding Unicode object.
867
868 errors (if non-NULL) defines the error handling. It defaults
869 to "strict".
870
871 If byteorder is non-NULL, the decoder starts decoding using the
872 given byte order:
873
874 *byteorder == -1: little endian
875 *byteorder == 0: native order
876 *byteorder == 1: big endian
877
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000878 In native mode, the first two bytes of the stream are checked for a
879 BOM mark. If found, the BOM mark is analysed, the byte order
880 adjusted and the BOM skipped. In the other modes, no BOM mark
881 interpretation is done. After completion, *byteorder is set to the
882 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000883
884 If byteorder is NULL, the codec starts in native order mode.
885
886*/
887
Mark Hammond91a681d2002-08-12 07:21:58 +0000888PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Guido van Rossumd8225182000-03-10 22:33:05 +0000889 const char *string, /* UTF-16 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000890 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000891 const char *errors, /* error handling */
892 int *byteorder /* pointer to byteorder to use
893 0=native;-1=LE,1=BE; updated on
894 exit */
895 );
896
Walter Dörwald69652032004-09-07 20:24:22 +0000897PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
898 const char *string, /* UTF-16 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000899 Py_ssize_t length, /* size of string */
Walter Dörwald69652032004-09-07 20:24:22 +0000900 const char *errors, /* error handling */
901 int *byteorder, /* pointer to byteorder to use
902 0=native;-1=LE,1=BE; updated on
903 exit */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000904 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000905 );
906
Guido van Rossumd8225182000-03-10 22:33:05 +0000907/* Returns a Python string using the UTF-16 encoding in native byte
908 order. The string always starts with a BOM mark. */
909
Mark Hammond91a681d2002-08-12 07:21:58 +0000910PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000911 PyObject *unicode /* Unicode object */
912 );
913
914/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000915 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000916
917 If byteorder is not 0, output is written according to the following
918 byte order:
919
920 byteorder == -1: little endian
921 byteorder == 0: native byte order (writes a BOM mark)
922 byteorder == 1: big endian
923
924 If byteorder is 0, the output string will always start with the
925 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
926 prepended.
927
928 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
929 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000930 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000931
932*/
933
Mark Hammond91a681d2002-08-12 07:21:58 +0000934PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Guido van Rossumd8225182000-03-10 22:33:05 +0000935 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000936 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000937 const char *errors, /* error handling */
938 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
939 );
940
941/* --- Unicode-Escape Codecs ---------------------------------------------- */
942
Mark Hammond91a681d2002-08-12 07:21:58 +0000943PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000944 const char *string, /* Unicode-Escape encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000945 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000946 const char *errors /* error handling */
947 );
948
Mark Hammond91a681d2002-08-12 07:21:58 +0000949PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000950 PyObject *unicode /* Unicode object */
951 );
952
Mark Hammond91a681d2002-08-12 07:21:58 +0000953PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000954 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000955 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000956 );
957
958/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
959
Mark Hammond91a681d2002-08-12 07:21:58 +0000960PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000961 const char *string, /* Raw-Unicode-Escape encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000962 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000963 const char *errors /* error handling */
964 );
965
Mark Hammond91a681d2002-08-12 07:21:58 +0000966PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000967 PyObject *unicode /* Unicode object */
968 );
969
Mark Hammond91a681d2002-08-12 07:21:58 +0000970PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000971 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000972 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000973 );
974
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000975/* --- Unicode Internal Codec ---------------------------------------------
976
977 Only for internal use in _codecsmodule.c */
978
979PyObject *_PyUnicode_DecodeUnicodeInternal(
980 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000981 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000982 const char *errors
983 );
984
Guido van Rossumd8225182000-03-10 22:33:05 +0000985/* --- Latin-1 Codecs -----------------------------------------------------
986
987 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
988
989*/
990
Mark Hammond91a681d2002-08-12 07:21:58 +0000991PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Guido van Rossumd8225182000-03-10 22:33:05 +0000992 const char *string, /* Latin-1 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000993 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000994 const char *errors /* error handling */
995 );
996
Mark Hammond91a681d2002-08-12 07:21:58 +0000997PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000998 PyObject *unicode /* Unicode object */
999 );
1000
Mark Hammond91a681d2002-08-12 07:21:58 +00001001PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Guido van Rossumd8225182000-03-10 22:33:05 +00001002 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001003 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001004 const char *errors /* error handling */
1005 );
1006
1007/* --- ASCII Codecs -------------------------------------------------------
1008
1009 Only 7-bit ASCII data is accepted. All other codes generate errors.
1010
1011*/
1012
Mark Hammond91a681d2002-08-12 07:21:58 +00001013PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Guido van Rossumd8225182000-03-10 22:33:05 +00001014 const char *string, /* ASCII encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001015 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001016 const char *errors /* error handling */
1017 );
1018
Mark Hammond91a681d2002-08-12 07:21:58 +00001019PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Guido van Rossumd8225182000-03-10 22:33:05 +00001020 PyObject *unicode /* Unicode object */
1021 );
1022
Mark Hammond91a681d2002-08-12 07:21:58 +00001023PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001025 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001026 const char *errors /* error handling */
1027 );
1028
1029/* --- Character Map Codecs -----------------------------------------------
1030
1031 This codec uses mappings to encode and decode characters.
1032
1033 Decoding mappings must map single string characters to single
1034 Unicode characters, integers (which are then interpreted as Unicode
1035 ordinals) or None (meaning "undefined mapping" and causing an
1036 error).
1037
1038 Encoding mappings must map single Unicode characters to single
1039 string characters, integers (which are then interpreted as Latin-1
1040 ordinals) or None (meaning "undefined mapping" and causing an
1041 error).
1042
1043 If a character lookup fails with a LookupError, the character is
1044 copied as-is meaning that its ordinal value will be interpreted as
1045 Unicode or Latin-1 ordinal resp. Because of this mappings only need
1046 to contain those mappings which map characters to different code
1047 points.
1048
1049*/
1050
Mark Hammond91a681d2002-08-12 07:21:58 +00001051PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +00001052 const char *string, /* Encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001053 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001054 PyObject *mapping, /* character mapping
1055 (char ordinal -> unicode ordinal) */
1056 const char *errors /* error handling */
1057 );
1058
Mark Hammond91a681d2002-08-12 07:21:58 +00001059PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Guido van Rossumd8225182000-03-10 22:33:05 +00001060 PyObject *unicode, /* Unicode object */
1061 PyObject *mapping /* character mapping
1062 (unicode ordinal -> char ordinal) */
1063 );
1064
Mark Hammond91a681d2002-08-12 07:21:58 +00001065PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +00001066 const Py_UNICODE *data, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001067 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001068 PyObject *mapping, /* character mapping
1069 (unicode ordinal -> char ordinal) */
1070 const char *errors /* error handling */
1071 );
1072
1073/* Translate a Py_UNICODE buffer of the given length by applying a
1074 character mapping table to it and return the resulting Unicode
1075 object.
1076
1077 The mapping table must map Unicode ordinal integers to Unicode
1078 ordinal integers or None (causing deletion of the character).
1079
1080 Mapping tables may be dictionaries or sequences. Unmapped character
1081 ordinals (ones which cause a LookupError) are left untouched and
1082 are copied as-is.
1083
1084*/
1085
Mark Hammond91a681d2002-08-12 07:21:58 +00001086PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +00001087 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001088 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001089 PyObject *table, /* Translate table */
1090 const char *errors /* error handling */
1091 );
1092
Guido van Rossumefec1152000-03-28 02:01:15 +00001093#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +00001094
Guido van Rossumefec1152000-03-28 02:01:15 +00001095/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001096
Mark Hammond91a681d2002-08-12 07:21:58 +00001097PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001098 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001099 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001100 const char *errors /* error handling */
1101 );
1102
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001103PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1104 const char *string, /* MBCS encoded string */
1105 Py_ssize_t length, /* size of string */
1106 const char *errors, /* error handling */
1107 Py_ssize_t *consumed /* bytes consumed */
1108 );
1109
Mark Hammond91a681d2002-08-12 07:21:58 +00001110PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001111 PyObject *unicode /* Unicode object */
1112 );
1113
Mark Hammond91a681d2002-08-12 07:21:58 +00001114PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001115 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001116 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001117 const char *errors /* error handling */
1118 );
1119
Guido van Rossumefec1152000-03-28 02:01:15 +00001120#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001121
Guido van Rossum9e896b32000-04-05 20:11:21 +00001122/* --- Decimal Encoder ---------------------------------------------------- */
1123
1124/* Takes a Unicode string holding a decimal value and writes it into
1125 an output buffer using standard ASCII digit codes.
1126
1127 The output buffer has to provide at least length+1 bytes of storage
1128 area. The output string is 0-terminated.
1129
1130 The encoder converts whitespace to ' ', decimal characters to their
1131 corresponding ASCII digit and all other Latin-1 characters except
1132 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1133 are treated as errors. This includes embedded NULL bytes.
1134
1135 Error handling is defined by the errors argument:
1136
1137 NULL or "strict": raise a ValueError
1138 "ignore": ignore the wrong characters (these are not copied to the
1139 output buffer)
1140 "replace": replaces illegal characters with '?'
1141
1142 Returns 0 on success, -1 on failure.
1143
1144*/
1145
Mark Hammond91a681d2002-08-12 07:21:58 +00001146PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Guido van Rossum9e896b32000-04-05 20:11:21 +00001147 Py_UNICODE *s, /* Unicode buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001148 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001149 char *output, /* Output buffer; must have size >= length+1 (room for the 0-terminator) */
1150 const char *errors /* error handling */
1151 );
1152
Guido van Rossumd8225182000-03-10 22:33:05 +00001153/* --- Methods & Slots ----------------------------------------------------
1154
1155 These are capable of handling Unicode objects and strings on input
1156 (we refer to them as strings in the descriptions) and return
1157 Unicode objects or integers as appropriate. */
1158
1159/* Concat two strings giving a new Unicode string. */
1160
Mark Hammond91a681d2002-08-12 07:21:58 +00001161PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Guido van Rossumd8225182000-03-10 22:33:05 +00001162 PyObject *left, /* Left string */
1163 PyObject *right /* Right string */
1164 );
1165
Walter Dörwald1ab83302007-05-18 17:15:44 +00001166/* Concat two strings and put the result in *pleft
1167 (sets *pleft to NULL on error) */
1168
1169PyAPI_FUNC(void) PyUnicode_Append(
1170 PyObject **pleft, /* Pointer to left string */
1171 PyObject *right /* Right string */
1172 );
1173
1174/* Concat two strings, put the result in *pleft and drop the right object
1175 (sets *pleft to NULL on error) */
1176
1177PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1178 PyObject **pleft, /* Pointer to left string */
1179 PyObject *right /* Right string */
1180 );
1181
Guido van Rossumd8225182000-03-10 22:33:05 +00001182/* Split a string giving a list of Unicode strings.
1183
1184 If sep is NULL, splitting will be done at all whitespace
1185 substrings. Otherwise, splits occur at the given separator.
1186
1187 At most maxsplit splits will be done. If negative, no limit is set.
1188
1189 Separators are not included in the resulting list.
1190
1191*/
1192
Mark Hammond91a681d2002-08-12 07:21:58 +00001193PyAPI_FUNC(PyObject*) PyUnicode_Split(
Guido van Rossumd8225182000-03-10 22:33:05 +00001194 PyObject *s, /* String to split */
1195 PyObject *sep, /* String separator */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001196 Py_ssize_t maxsplit /* Maxsplit count */
Guido van Rossumd8225182000-03-10 22:33:05 +00001197 );
1198
1199/* Ditto, but split at line breaks.
1200
1201 CRLF is considered to be one line break. Line breaks are not
1202 included in the resulting list. */
1203
Mark Hammond91a681d2002-08-12 07:21:58 +00001204PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Guido van Rossumd8225182000-03-10 22:33:05 +00001205 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +00001206 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +00001207 );
1208
Thomas Wouters477c8d52006-05-27 19:21:47 +00001209/* Partition a string using a given separator. */
1210
1211PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1212 PyObject *s, /* String to partition */
1213 PyObject *sep /* String separator */
1214 );
1215
1216/* Partition a string using a given separator, searching from the end of the
1217 string. */
1218
1219PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1220 PyObject *s, /* String to partition */
1221 PyObject *sep /* String separator */
1222 );
1223
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001224/* Split a string giving a list of Unicode strings.
1225
1226 If sep is NULL, splitting will be done at all whitespace
1227 substrings. Otherwise, splits occur at the given separator.
1228
1229 At most maxsplit splits will be done; if maxsplit is negative, no
1230 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1231 the end of the string.
1232
1233 Separators are not included in the resulting list.
1234
1235*/
1236
1237PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1238 PyObject *s, /* String to split */
1239 PyObject *sep, /* String separator */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001240 Py_ssize_t maxsplit /* Maxsplit count */
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001241 );
1242
Guido van Rossumd8225182000-03-10 22:33:05 +00001243/* Translate a string by applying a character mapping table to it and
1244 return the resulting Unicode object.
1245
1246 The mapping table must map Unicode ordinal integers to Unicode
1247 ordinal integers or None (causing deletion of the character).
1248
1249 Mapping tables may be dictionaries or sequences. Unmapped character
1250 ordinals (ones which cause a LookupError) are left untouched and
1251 are copied as-is.
1252
1253*/
1254
Mark Hammond91a681d2002-08-12 07:21:58 +00001255PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Guido van Rossumd8225182000-03-10 22:33:05 +00001256 PyObject *str, /* String */
1257 PyObject *table, /* Translate table */
1258 const char *errors /* error handling */
1259 );
1260
1261/* Join a sequence of strings using the given separator and return
1262 the resulting Unicode string. */
1263
Mark Hammond91a681d2002-08-12 07:21:58 +00001264PyAPI_FUNC(PyObject*) PyUnicode_Join(
Guido van Rossumd8225182000-03-10 22:33:05 +00001265 PyObject *separator, /* Separator string */
1266 PyObject *seq /* Sequence object */
1267 );
1268
1269/* Return 1 if substr matches str[start:end] at the given tail end, 0
1270 otherwise.

   direction selects which end of the slice is tested: -1 tests for a
   prefix match, +1 for a suffix match.

   NOTE(review): the declared return type is Py_ssize_t rather than int,
   and no error value is documented here (PyUnicode_Find, by contrast,
   documents a dedicated error return). Confirm against the
   implementation what is returned when an exception is set. */
1271
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Guido van Rossumd8225182000-03-10 22:33:05 +00001273 PyObject *str, /* String */
1274 PyObject *substr, /* Prefix or Suffix string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 Py_ssize_t start, /* Start index */
1276 Py_ssize_t end, /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001277 int direction /* Tail end: -1 prefix, +1 suffix */
1278 );
1279
1280/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001281 given search direction or -1 if not found. -2 is returned in case
1282 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001283
Martin v. Löwis18e16552006-02-15 17:27:45 +00001284PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Guido van Rossumd8225182000-03-10 22:33:05 +00001285 PyObject *str, /* String */
1286 PyObject *substr, /* Substring to find */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t start, /* Start index */
1288 Py_ssize_t end, /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001289 int direction /* Find direction: +1 forward, -1 backward */
1290 );
1291
Barry Warsaw51ac5802000-03-20 16:36:48 +00001292/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001293
Martin v. Löwis18e16552006-02-15 17:27:45 +00001294PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Guido van Rossumd8225182000-03-10 22:33:05 +00001295 PyObject *str, /* String */
1296 PyObject *substr, /* Substring to count */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001297 Py_ssize_t start, /* Start index */
1298 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001299 );
1300
Barry Warsaw51ac5802000-03-20 16:36:48 +00001301/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 and return the resulting Unicode object. */
1303
Mark Hammond91a681d2002-08-12 07:21:58 +00001304PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Guido van Rossumd8225182000-03-10 22:33:05 +00001305 PyObject *str, /* String */
1306 PyObject *substr, /* Substring to find */
1307 PyObject *replstr, /* Substring to replace */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001308 Py_ssize_t maxcount /* Max. number of replacements to apply;
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 -1 = all */
1310 );
1311
1312/* Compare two strings and return -1, 0, 1 for less than, equal to,
1313 or greater than, respectively. */
1314
Mark Hammond91a681d2002-08-12 07:21:58 +00001315PyAPI_FUNC(int) PyUnicode_Compare(
Guido van Rossumd8225182000-03-10 22:33:05 +00001316 PyObject *left, /* Left string */
1317 PyObject *right /* Right string */
1318 );
1319
/* Compare an object (left) with a char string (right) and return an int.

   NOTE(review): judging by the name, left is a Unicode object and right
   is a null-terminated C string containing only ASCII characters, and
   the result presumably follows the -1, 0, 1 convention of
   PyUnicode_Compare. None of this is stated in this header; confirm
   against the implementation, including the behaviour on error. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001320PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1321 PyObject *left, /* Left operand (object) */
1322 const char *right /* Right operand (char string, not modified) */
1323 );
1324
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001325/* Rich compare two strings and return one of the following:
1326
1327 - NULL in case an exception was raised
1328 - Py_True or Py_False for successful comparisons
1329 - Py_NotImplemented in case the type combination is unknown
1330
1331 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1332 case the conversion of the arguments to Unicode fails with a
1333 UnicodeDecodeError.
1334
1335 Possible values for op:
1336
1337 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1338
1339*/
1340
1341PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1342 PyObject *left, /* Left string */
1343 PyObject *right, /* Right string */
1344 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1345 );
1346
Thomas Wouters7e474022000-07-16 12:04:32 +00001347/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001348 the resulting Unicode string. */
1349
Mark Hammond91a681d2002-08-12 07:21:58 +00001350PyAPI_FUNC(PyObject *) PyUnicode_Format(
Guido van Rossumd8225182000-03-10 22:33:05 +00001351 PyObject *format, /* Format string */
1352 PyObject *args /* Argument tuple or dictionary */
1353 );
1354
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001355/* Check whether element is contained in container and return 1/0
1356 accordingly.
1357
1358 element has to coerce to a one-element Unicode string. -1 is
1359 returned in case of an error. */
1360
Mark Hammond91a681d2002-08-12 07:21:58 +00001361PyAPI_FUNC(int) PyUnicode_Contains(
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001362 PyObject *container, /* Container string */
1363 PyObject *element /* Element string */
1364 );
1365
Martin v. Löwis47383402007-08-15 07:32:56 +00001366/* Checks whether argument is a valid identifier. */
1367
1368PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1369
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001370/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001371PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001372 PyUnicodeObject *self,
1373 int striptype,
1374 PyObject *sepobj
1375 );
1376
Guido van Rossumd8225182000-03-10 22:33:05 +00001377/* === Characters Type APIs =============================================== */
1378
1379/* These should not be used directly. Use the Py_UNICODE_IS* and
1380 Py_UNICODE_TO* macros instead.
1381
1382 These APIs are implemented in Objects/unicodectype.c.
1383
1384*/
1385
Mark Hammond91a681d2002-08-12 07:21:58 +00001386PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001387 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001388 );
1389
Mark Hammond91a681d2002-08-12 07:21:58 +00001390PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001391 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001392 );
1393
Mark Hammond91a681d2002-08-12 07:21:58 +00001394PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001395 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001396 );
1397
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001398PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1399 Py_UNICODE ch /* Unicode character */
1400 );
1401
1402PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1403 Py_UNICODE ch /* Unicode character */
1404 );
1405
/* NOTE(review): the top-level const on this by-value parameter does not
   change the function's type as seen by callers, and most of the other
   character-type declarations in this section omit it. Presumably it is
   kept to match the definition -- confirm before removing it. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001406PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Tim Peters2576c972005-10-29 02:33:18 +00001407 const Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001408 );
1409
/* NOTE(review): the top-level const on this by-value parameter does not
   change the function's type as seen by callers, and most of the other
   character-type declarations in this section omit it. Presumably it is
   kept to match the definition -- confirm before removing it. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001410PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Tim Peters2576c972005-10-29 02:33:18 +00001411 const Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001412 );
1413
Mark Hammond91a681d2002-08-12 07:21:58 +00001414PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001415 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001416 );
1417
Mark Hammond91a681d2002-08-12 07:21:58 +00001418PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001419 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001420 );
1421
Mark Hammond91a681d2002-08-12 07:21:58 +00001422PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001423 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001424 );
1425
Mark Hammond91a681d2002-08-12 07:21:58 +00001426PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001427 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001428 );
1429
Mark Hammond91a681d2002-08-12 07:21:58 +00001430PyAPI_FUNC(int) _PyUnicode_ToDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001431 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001432 );
1433
Mark Hammond91a681d2002-08-12 07:21:58 +00001434PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001435 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001436 );
1437
Mark Hammond91a681d2002-08-12 07:21:58 +00001438PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001439 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001440 );
1441
Mark Hammond91a681d2002-08-12 07:21:58 +00001442PyAPI_FUNC(int) _PyUnicode_IsDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001443 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001444 );
1445
Mark Hammond91a681d2002-08-12 07:21:58 +00001446PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001447 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001448 );
1449
Mark Hammond91a681d2002-08-12 07:21:58 +00001450PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001451 Py_UNICODE ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001452 );
1453
/* Takes a pointer to read-only Py_UNICODE data and returns a size_t.

   NOTE(review): the name and signature parallel C strlen(); presumably
   this returns the number of Py_UNICODE units that precede a
   terminating null unit -- confirm against the implementation. */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001454PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1455
/* s1 is the writable buffer, s2 is read-only; a Py_UNICODE pointer is
   returned.

   NOTE(review): the name and signature parallel C strcpy(); presumably
   s2 is copied into s1 including its terminating null unit and s1 is
   returned. No length is passed, so the caller would be responsible for
   making s1 large enough -- confirm against the implementation. */
1456PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1457 Py_UNICODE *s1, const Py_UNICODE *s2);
1458
/* s1 is the writable buffer, s2 is read-only, n is a size_t count; a
   Py_UNICODE pointer is returned.

   NOTE(review): the name and signature parallel C strncpy(); presumably
   at most n Py_UNICODE units are copied from s2 to s1. If the C
   semantics are followed, s1 is NOT null-terminated when s2 holds n or
   more units -- confirm against the implementation before relying on
   termination. */
1459PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1460 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1461
/* Takes two read-only Py_UNICODE pointers and returns an int.

   NOTE(review): the name and signature parallel C strcmp(); presumably
   the result is negative, zero or positive depending on how s1 orders
   relative to s2. Whether the values are normalised to -1/0/1 is not
   stated here -- confirm against the implementation. */
1462PyAPI_FUNC(int) Py_UNICODE_strcmp(
1463 const Py_UNICODE *s1, const Py_UNICODE *s2);
1464
/* Takes a read-only Py_UNICODE pointer s and a Py_UNICODE value c, and
   returns a Py_UNICODE pointer. Note that the returned pointer is not
   const-qualified even though s is.

   NOTE(review): the name and signature parallel C strchr(); presumably
   this returns a pointer to the first occurrence of c in s, or NULL if
   c does not occur -- confirm against the implementation. */
1465PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1466 const Py_UNICODE *s, Py_UNICODE c
1467 );
1468
Guido van Rossumd8225182000-03-10 22:33:05 +00001469#ifdef __cplusplus
1470}
1471#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001472#endif /* !Py_UNICODEOBJECT_H */