blob: f3c37fe63b6fb9cd0377ffe26b7943385c98dc2d [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000055#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000056
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
   properly set, but the default rules below don't set it. I'll
   sort this out some other day -- fredrik@pythonware.com */
64
65#ifndef Py_UNICODE_SIZE
66#error Must define Py_UNICODE_SIZE
67#endif
68
Fredrik Lundh8f455852001-06-27 18:59:43 +000069/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
70 strings are stored as UCS-2 (with limited support for UTF-16) */
71
72#if Py_UNICODE_SIZE >= 4
73#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000074#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000075
Guido van Rossumd8225182000-03-10 22:33:05 +000076/* Set these flags if the platform has "wchar.h", "wctype.h" and the
77 wchar_t type is a 16-bit unsigned type */
78/* #define HAVE_WCHAR_H */
79/* #define HAVE_USABLE_WCHAR_T */
80
81/* Defaults for various platforms */
#ifndef PY_UNICODE_TYPE

/* Windows has a usable wchar_t type, so builds with 2-byte code units
   store Unicode data in it directly (not taken for UCS-4 builds). */
# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
#  define HAVE_USABLE_WCHAR_T
#  define PY_UNICODE_TYPE wchar_t
# endif

/* Wide (UCS-4) builds store every code unit in a Py_UCS4. */
# if defined(Py_UNICODE_WIDE)
#  define PY_UNICODE_TYPE Py_UCS4
# endif

#endif /* !PY_UNICODE_TYPE */
95
96/* If the compiler provides a wchar_t type we try to support it
97 through the interface functions PyUnicode_FromWideChar() and
98 PyUnicode_AsWideChar(). */
99
/* Make sure HAVE_WCHAR_H is set whenever HAVE_USABLE_WCHAR_T is. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
# define HAVE_WCHAR_H
#endif

#ifdef HAVE_WCHAR_H
/* BSDI 4.x: include <time.h> first to work around a cosmetic bug in
   its wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include <wchar.h>
#endif
113
/*
 * Py_UCS4: use this typedef when you need to represent a UTF-16
 * surrogate pair as a single unsigned integer.
 *
 * unsigned int is chosen when the build configuration reports it as at
 * least 4 bytes wide (SIZEOF_INT >= 4).  In every other case unsigned
 * long is used: ISO C requires ULONG_MAX >= 4294967295, so it is always
 * wide enough, and the type is now declared even if the SIZEOF_*
 * configuration macros are missing instead of being silently left out.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif
123
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000124typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000125
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000126/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
127
128/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds
129 produce different external names and thus cause import errors in
130 case Python interpreters and extensions with mixed compiled in
131 Unicode width assumptions are combined. */
132
133#ifndef Py_UNICODE_WIDE
134
135# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
136# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000137# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000138# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
139# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
140# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
Walter Dörwald41980ca2007-08-16 21:55:45 +0000141# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000142# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
143# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
144# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
145# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
146# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
147# define PyUnicode_Compare PyUnicodeUCS2_Compare
148# define PyUnicode_Concat PyUnicodeUCS2_Concat
Walter Dörwald1ab83302007-05-18 17:15:44 +0000149# define PyUnicode_Append PyUnicodeUCS2_Append
150# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000151# define PyUnicode_Contains PyUnicodeUCS2_Contains
152# define PyUnicode_Count PyUnicodeUCS2_Count
153# define PyUnicode_Decode PyUnicodeUCS2_Decode
154# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
155# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
156# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000157# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
Christian Heimes5894ba72007-11-04 11:43:14 +0000158# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000159# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000160# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
161# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000162# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
Walter Dörwald69652032004-09-07 20:24:22 +0000163# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000164# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
Walter Dörwald69652032004-09-07 20:24:22 +0000165# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000166# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
167# define PyUnicode_Encode PyUnicodeUCS2_Encode
168# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
169# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
170# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
171# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
172# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000173# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000174# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
175# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
176# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
177# define PyUnicode_Find PyUnicodeUCS2_Find
178# define PyUnicode_Format PyUnicodeUCS2_Format
179# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
180# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000181# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000182# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000183# define PyUnicode_FromString PyUnicodeUCS2_FromString
Walter Dörwaldd2034312007-05-18 16:29:38 +0000184# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
185# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
186# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
Walter Dörwald14176a52007-05-18 17:04:42 +0000187# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000188# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
189# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
190# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
Martin v. Löwis47383402007-08-15 07:32:56 +0000191# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000192# define PyUnicode_Join PyUnicodeUCS2_Join
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193# define PyUnicode_Partition PyUnicodeUCS2_Partition
194# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
195# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000196# define PyUnicode_Replace PyUnicodeUCS2_Replace
197# define PyUnicode_Resize PyUnicodeUCS2_Resize
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000198# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000199# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
200# define PyUnicode_Split PyUnicodeUCS2_Split
201# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
202# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
203# define PyUnicode_Translate PyUnicodeUCS2_Translate
204# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
205# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
206# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
207# define _PyUnicode_Init _PyUnicodeUCS2_Init
208# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
209# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
210# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
211# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
212# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
213# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
214# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000215# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
216# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000217# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
218# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
219# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
220# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
221# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
222# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
223# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
224# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
225
226#else
227
228# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
229# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000230# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000231# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
232# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
233# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
Walter Dörwald41980ca2007-08-16 21:55:45 +0000234# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000235# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
236# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
237# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
238# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
239# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
240# define PyUnicode_Compare PyUnicodeUCS4_Compare
241# define PyUnicode_Concat PyUnicodeUCS4_Concat
Walter Dörwald1ab83302007-05-18 17:15:44 +0000242# define PyUnicode_Append PyUnicodeUCS4_Append
243# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000244# define PyUnicode_Contains PyUnicodeUCS4_Contains
245# define PyUnicode_Count PyUnicodeUCS4_Count
246# define PyUnicode_Decode PyUnicodeUCS4_Decode
247# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
248# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
249# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000250# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
Christian Heimes5894ba72007-11-04 11:43:14 +0000251# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000252# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000253# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
254# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000255# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
Walter Dörwald69652032004-09-07 20:24:22 +0000256# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000257# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
Walter Dörwald69652032004-09-07 20:24:22 +0000258# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000259# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
260# define PyUnicode_Encode PyUnicodeUCS4_Encode
261# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
262# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
263# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
264# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
265# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
Walter Dörwald41980ca2007-08-16 21:55:45 +0000266# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000267# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
268# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
269# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
270# define PyUnicode_Find PyUnicodeUCS4_Find
271# define PyUnicode_Format PyUnicodeUCS4_Format
272# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
273# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000274# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000275# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000276# define PyUnicode_FromString PyUnicodeUCS4_FromString
Walter Dörwaldd2034312007-05-18 16:29:38 +0000277# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
278# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
279# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000280# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
281# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
282# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
283# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
Martin v. Löwis47383402007-08-15 07:32:56 +0000284# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000285# define PyUnicode_Join PyUnicodeUCS4_Join
Thomas Wouters477c8d52006-05-27 19:21:47 +0000286# define PyUnicode_Partition PyUnicodeUCS4_Partition
287# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
288# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000289# define PyUnicode_Replace PyUnicodeUCS4_Replace
290# define PyUnicode_Resize PyUnicodeUCS4_Resize
Thomas Wouters00ee7ba2006-08-21 19:07:27 +0000291# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000292# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
293# define PyUnicode_Split PyUnicodeUCS4_Split
294# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
295# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
296# define PyUnicode_Translate PyUnicodeUCS4_Translate
297# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
298# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
299# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
300# define _PyUnicode_Init _PyUnicodeUCS4_Init
301# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
302# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
303# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
304# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
305# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
306# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
307# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000308# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
309# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000310# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
311# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
312# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
313# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
314# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
315# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
316# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
317# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
318
319
320#endif
321
Guido van Rossumd8225182000-03-10 22:33:05 +0000322/* --- Internal Unicode Operations ---------------------------------------- */
323
/* Character classification and conversion macros.

   If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-wctype-functions. This reduces the
   interpreter's code size. The wctype.h variants are only selected
   when HAVE_USABLE_WCHAR_T is defined as well. */

/* Macros whose implementation depends on that choice. */
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include <wctype.h>

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* Macros that map onto the _PyUnicode_* routines in either
   configuration. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric; the checks
   are tried in that order and ch is expanded once per check. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
384
/* Copy length Py_UNICODE code units from source to target using
   Py_MEMCPY; the byte count is length times sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), sizeof(Py_UNICODE) * (length))
Guido van Rossumd8225182000-03-10 22:33:05 +0000387
/* Store value into each of the first length code units of target.

   Every argument is evaluated exactly once (target, then value, then
   length), so an expression with side effects or a non-trivial cost can
   be passed as length without being re-run on each loop iteration. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     Py_ssize_t n_ = (length); Py_ssize_t i_;\
     for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
Guido van Rossumd8225182000-03-10 22:33:05 +0000392
/* Check whether substring matches string at the given offset.

   Preconditions: the offset must be valid and the substring must not
   be empty. The first and the last code unit are compared before the
   whole range is handed to memcmp(). */
#define Py_UNICODE_MATCH(string, offset, substring) \
    (((string)->str[(offset)] == (substring)->str[0]) && \
     (((string)->str + (offset))[(substring)->length - 1] == \
      (substring)->str[(substring)->length - 1]) && \
     !memcmp((string)->str + (offset), (substring)->str, \
             (substring)->length*sizeof(Py_UNICODE)))
Guido van Rossumd8225182000-03-10 22:33:05 +0000399
Barry Warsaw51ac5802000-03-20 16:36:48 +0000400#ifdef __cplusplus
401extern "C" {
402#endif
403
Guido van Rossumd8225182000-03-10 22:33:05 +0000404/* --- Unicode Type ------------------------------------------------------- */
405
406typedef struct {
407 PyObject_HEAD
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t length; /* Length of raw Unicode data in buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000409 Py_UNICODE *str; /* Raw Unicode buffer */
410 long hash; /* Hash value; -1 if not set */
Walter Dörwald16807132007-05-25 13:52:07 +0000411 int state; /* != 0 if interned. In this case the two
412 * references from the dictionary to this object
413 * are *not* counted in ob_refcnt. */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000414 PyObject *defenc; /* (Default) Encoded version as Python
415 string, or NULL; this is used for
416 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000417} PyUnicodeObject;
418
Mark Hammond91a681d2002-08-12 07:21:58 +0000419PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000420
Walter Dörwald16807132007-05-25 13:52:07 +0000421#define SSTATE_NOT_INTERNED 0
422#define SSTATE_INTERNED_MORTAL 1
423#define SSTATE_INTERNED_IMMORTAL 2
424
Thomas Wouters27d517b2007-02-25 20:39:11 +0000425#define PyUnicode_Check(op) \
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000426 PyType_FastSubclass(Py_Type(op), Py_TPFLAGS_UNICODE_SUBCLASS)
427#define PyUnicode_CheckExact(op) (Py_Type(op) == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000428
/* Fast access macros.

   Each macro asserts PyUnicode_Check(op) and then reads the requested
   field through a cast to PyUnicodeObject *; no other checking is
   done. */

/* Value of the length field. */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyUnicodeObject *)(op))->length))

/* Value of the length field multiplied by sizeof(Py_UNICODE). */
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))

/* The str field, i.e. the Py_UNICODE buffer. */
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyUnicodeObject *)(op))->str))

/* The same buffer, viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     ((const char *)((PyUnicodeObject *)(op))->str))
Guido van Rossumd8225182000-03-10 22:33:05 +0000438
439/* --- Constants ---------------------------------------------------------- */
440
441/* This Unicode character will be used as replacement character during
442 decoding if the errors argument is set to "replace". Note: the
443 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
444 Unicode 3.0. */
445
446#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
447
448/* === Public API ========================================================= */
449
450/* --- Plain Py_UNICODE --------------------------------------------------- */
451
452/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453 size.
454
455 u may be NULL which causes the contents to be undefined. It is the
456 user's responsibility to fill in the needed data afterwards. Note
457 that modifying the Unicode object contents after construction is
458 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000459
460 The buffer is copied into the new object. */
461
Mark Hammond91a681d2002-08-12 07:21:58 +0000462PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000463 const Py_UNICODE *u, /* Unicode buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000464 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000465 );
466
Walter Dörwaldd2034312007-05-18 16:29:38 +0000467/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
468PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
469 const char *u, /* char buffer */
470 Py_ssize_t size /* size of buffer */
471 );
472
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000473/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
474 Latin-1 encoded bytes */
475PyAPI_FUNC(PyObject*) PyUnicode_FromString(
476 const char *u /* string */
477 );
478
Guido van Rossumd8225182000-03-10 22:33:05 +0000479/* Return a read-only pointer to the Unicode object's internal
480 Py_UNICODE buffer. */
481
Mark Hammond91a681d2002-08-12 07:21:58 +0000482PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000483 PyObject *unicode /* Unicode object */
484 );
485
486/* Get the length of the Unicode object. */
487
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
Guido van Rossumd8225182000-03-10 22:33:05 +0000489 PyObject *unicode /* Unicode object */
490 );
491
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000492/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000493PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000494
Guido van Rossum52c23592000-04-10 13:41:41 +0000495/* Resize an already allocated Unicode object to the new size length.
496
497 *unicode is modified to point to the new (resized) object and 0
498 returned on success.
499
500 This API may only be called by the function which also called the
501 Unicode constructor. The refcount on the object must be 1. Otherwise,
502 an error is returned.
503
504 Error handling is implemented as follows: an exception is set, -1
505 is returned and *unicode left untouched.
506
507*/
508
Mark Hammond91a681d2002-08-12 07:21:58 +0000509PyAPI_FUNC(int) PyUnicode_Resize(
Guido van Rossum52c23592000-04-10 13:41:41 +0000510 PyObject **unicode, /* Pointer to the Unicode object */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t length /* New length */
Guido van Rossum52c23592000-04-10 13:41:41 +0000512 );
513
Guido van Rossumd8225182000-03-10 22:33:05 +0000514/* Coerce obj to an Unicode object and return a reference with
515 *incremented* refcount.
516
517 Coercion is done in the following way:
518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519 1. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000520 under the assumptions that they contain data using the current
521 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000522
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000523 2. All other objects (including Unicode objects) raise an
524 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000525
526 The API returns NULL in case of an error. The caller is responsible
527 for decref'ing the returned objects.
528
529*/
530
Mark Hammond91a681d2002-08-12 07:21:58 +0000531PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000532 register PyObject *obj, /* Object */
533 const char *encoding, /* encoding */
534 const char *errors /* error handling */
535 );
536
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000538 *incremented* refcount.
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539
540 Unicode objects are passed back as-is (subclasses are converted to
541 true Unicode objects), all other objects are delegated to
542 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
543 using the default encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000544
545 The API returns NULL in case of an error. The caller is responsible
546 for decref'ing the returned objects.
547
548*/
549
Mark Hammond91a681d2002-08-12 07:21:58 +0000550PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Guido van Rossumd8225182000-03-10 22:33:05 +0000551 register PyObject *obj /* Object */
552 );
553
Walter Dörwaldd2034312007-05-18 16:29:38 +0000554PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
555PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
556
Walter Dörwald16807132007-05-25 13:52:07 +0000557PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
558PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
559PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
560PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
561
562/* Use only if you know it's a string */
563#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
564
Guido van Rossumd8225182000-03-10 22:33:05 +0000565/* --- wchar_t support for platforms which support it --------------------- */
566
567#ifdef HAVE_WCHAR_H
568
569/* Create a Unicode Object from the wchar_t buffer w of the given
570 size.
571
572 The buffer is copied into the new object. */
573
Mark Hammond91a681d2002-08-12 07:21:58 +0000574PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000575 register const wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000577 );
578
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000579/* Copies the Unicode Object contents into the wchar_t buffer w. At
Guido van Rossumd8225182000-03-10 22:33:05 +0000580 most size wchar_t characters are copied.
581
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000582 Note that the resulting wchar_t string may or may not be
583 0-terminated. It is the responsibility of the caller to make sure
584 that the wchar_t string is 0-terminated in case this is required by
585 the application.
586
587 Returns the number of wchar_t characters copied (excluding a
588 possibly trailing 0-termination character) or -1 in case of an
Guido van Rossumd8225182000-03-10 22:33:05 +0000589 error. */
590
Martin v. Löwis18e16552006-02-15 17:27:45 +0000591PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
Guido van Rossumd8225182000-03-10 22:33:05 +0000592 PyUnicodeObject *unicode, /* Unicode object */
593 register wchar_t *w, /* wchar_t buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000594 Py_ssize_t size /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000595 );
596
597#endif
598
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000599/* --- Unicode ordinals --------------------------------------------------- */
600
601/* Create a Unicode Object from the given Unicode code point ordinal.
602
603 The ordinal must be in range(0x10000) on narrow Python builds
604 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
605 raised in case it is not.
606
607*/
608
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000609PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000610
Guido van Rossumd8225182000-03-10 22:33:05 +0000611/* === Builtin Codecs =====================================================
612
613 Many of these APIs take two arguments encoding and errors. These
614 parameters encoding and errors have the same semantics as the ones
615 of the builtin unicode() API.
616
Fred Drakecb093fe2000-05-09 19:51:53 +0000617 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000618
619 Error handling is set by errors which may also be set to NULL
620 meaning to use the default handling defined for the codec. Default
621 error handling for all builtin codecs is "strict" (ValueErrors are
622 raised).
623
624 The codecs all use a similar interface. Only deviations from the
625 generic ones are documented.
626
627*/
628
Fred Drakecb093fe2000-05-09 19:51:53 +0000629/* --- Manage the default encoding ---------------------------------------- */
630
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000631/* Return a Python string holding the default encoded value of the
632 Unicode object.
633
634 The resulting string is cached in the Unicode object for subsequent
635 usage by this function. The cached version is needed to implement
636 the character buffer interface and will live (at least) as long as
637 the Unicode object itself.
638
639 The refcount of the string is *not* incremented.
640
641 *** Exported for internal use by the interpreter only !!! ***
642
643*/
644
Mark Hammond91a681d2002-08-12 07:21:58 +0000645PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000646 PyObject *, const char *);
647
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000648/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
649
650 If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
651 UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise fall back
652 to UTF-8 and replace invalid characters with '?'.
653
654 The function is intended to be used for paths and file names only
655 during bootstrapping process where the codecs are not set up.
656*/
657
658PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
659 const char *s /* encoded string */
660 );
661
Christian Heimes5894ba72007-11-04 11:43:14 +0000662PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
663 const char *s, /* encoded string */
664 Py_ssize_t size /* size */
665 );
666
667
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000668/* Return a char* holding the UTF-8 encoded value of the
669 Unicode object.
670
671 DEPRECATED: use PyUnicode_AsStringAndSize() instead.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000672*/
673
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000674PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *);
675
676/* PyUnicode_AsStringAndSize() (declared above) returns the UTF-8 encoding, and its size.
677
678 If the output argument is NULL, no size is stored.
679 */
680
Martin v. Löwis5b222132007-06-10 09:51:05 +0000681PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*);
682
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000683/* PyUnicode_AsString() (declared above) returns the UTF-8 encoding.
Martin v. Löwis5b222132007-06-10 09:51:05 +0000684
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000685 This is equivalent to PyUnicode_AsStringAndSize(x, NULL).
Fred Drakecb093fe2000-05-09 19:51:53 +0000686
Fred Drakecb093fe2000-05-09 19:51:53 +0000687 */
688
Mark Hammond91a681d2002-08-12 07:21:58 +0000689PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000690
691/* Sets the currently active default encoding.
692
693 Returns 0 on success, -1 in case of an error.
694
695 */
696
Mark Hammond91a681d2002-08-12 07:21:58 +0000697PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
Fred Drakecb093fe2000-05-09 19:51:53 +0000698 const char *encoding /* Encoding name in standard form */
699 );
700
Guido van Rossumd8225182000-03-10 22:33:05 +0000701/* --- Generic Codecs ----------------------------------------------------- */
702
703/* Create a Unicode object by decoding the encoded string s of the
704 given size. */
705
Mark Hammond91a681d2002-08-12 07:21:58 +0000706PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000707 const char *s, /* encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000708 Py_ssize_t size, /* size of buffer */
Guido van Rossumd8225182000-03-10 22:33:05 +0000709 const char *encoding, /* encoding */
710 const char *errors /* error handling */
711 );
712
713/* Encodes a Py_UNICODE buffer of the given size and returns a
714 Python string object. */
715
Mark Hammond91a681d2002-08-12 07:21:58 +0000716PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000717 const Py_UNICODE *s, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000718 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000719 const char *encoding, /* encoding */
720 const char *errors /* error handling */
721 );
722
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000723/* Encodes a Unicode object and returns the result as Python
724 object. */
725
726PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
727 PyObject *unicode, /* Unicode object */
728 const char *encoding, /* encoding */
729 const char *errors /* error handling */
730 );
731
Guido van Rossumd8225182000-03-10 22:33:05 +0000732/* Encodes a Unicode object and returns the result as Python string
733 object. */
734
Mark Hammond91a681d2002-08-12 07:21:58 +0000735PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000736 PyObject *unicode, /* Unicode object */
737 const char *encoding, /* encoding */
738 const char *errors /* error handling */
739 );
740
Thomas Wouters73e5a5b2006-06-08 15:35:45 +0000741PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
742 PyObject* string /* 256 character map */
743 );
744
745
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000746/* --- UTF-7 Codecs ------------------------------------------------------- */
747
Mark Hammond91a681d2002-08-12 07:21:58 +0000748PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000749 const char *string, /* UTF-7 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000750 Py_ssize_t length, /* size of string */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000751 const char *errors /* error handling */
752 );
753
Christian Heimes5d14c2b2007-11-20 23:38:09 +0000754PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
755 const char *string, /* UTF-7 encoded string */
756 Py_ssize_t length, /* size of string */
757 const char *errors, /* error handling */
758 Py_ssize_t *consumed /* bytes consumed */
759 );
760
Mark Hammond91a681d2002-08-12 07:21:58 +0000761PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000762 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000763 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000764 int encodeSetO, /* force the encoder to encode characters in
765 Set O, as described in RFC2152 */
766 int encodeWhiteSpace, /* force the encoder to encode space, tab,
767 carriage return and linefeed characters */
768 const char *errors /* error handling */
769 );
770
Guido van Rossumd8225182000-03-10 22:33:05 +0000771/* --- UTF-8 Codecs ------------------------------------------------------- */
772
Mark Hammond91a681d2002-08-12 07:21:58 +0000773PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Guido van Rossumd8225182000-03-10 22:33:05 +0000774 const char *string, /* UTF-8 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000775 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000776 const char *errors /* error handling */
777 );
778
Walter Dörwald69652032004-09-07 20:24:22 +0000779PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
780 const char *string, /* UTF-8 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000781 Py_ssize_t length, /* size of string */
Walter Dörwald69652032004-09-07 20:24:22 +0000782 const char *errors, /* error handling */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000783 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000784 );
785
Mark Hammond91a681d2002-08-12 07:21:58 +0000786PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000787 PyObject *unicode /* Unicode object */
788 );
789
Mark Hammond91a681d2002-08-12 07:21:58 +0000790PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Guido van Rossumd8225182000-03-10 22:33:05 +0000791 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000792 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000793 const char *errors /* error handling */
794 );
795
Walter Dörwald41980ca2007-08-16 21:55:45 +0000796/* --- UTF-32 Codecs ------------------------------------------------------ */
797
798/* Decodes length bytes from a UTF-32 encoded buffer string and returns
799 the corresponding Unicode object.
800
801 errors (if non-NULL) defines the error handling. It defaults
802 to "strict".
803
804 If byteorder is non-NULL, the decoder starts decoding using the
805 given byte order:
806
807 *byteorder == -1: little endian
808 *byteorder == 0: native order
809 *byteorder == 1: big endian
810
811 In native mode, the first four bytes of the stream are checked for a
812 BOM mark. If found, the BOM mark is analysed, the byte order
813 adjusted and the BOM skipped. In the other modes, no BOM mark
814 interpretation is done. After completion, *byteorder is set to the
815 current byte order at the end of input data.
816
817 If byteorder is NULL, the codec starts in native order mode.
818
819*/
820
821PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
822 const char *string, /* UTF-32 encoded string */
823 Py_ssize_t length, /* size of string */
824 const char *errors, /* error handling */
825 int *byteorder /* pointer to byteorder to use
826 0=native;-1=LE,1=BE; updated on
827 exit */
828 );
829
830PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
831 const char *string, /* UTF-32 encoded string */
832 Py_ssize_t length, /* size of string */
833 const char *errors, /* error handling */
834 int *byteorder, /* pointer to byteorder to use
835 0=native;-1=LE,1=BE; updated on
836 exit */
837 Py_ssize_t *consumed /* bytes consumed */
838 );
839
840/* Returns a Python string using the UTF-32 encoding in native byte
841 order. The string always starts with a BOM mark. */
842
843PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
844 PyObject *unicode /* Unicode object */
845 );
846
847/* Returns a Python string object holding the UTF-32 encoded value of
848 the Unicode data.
849
850 If byteorder is not 0, output is written according to the following
851 byte order:
852
853 byteorder == -1: little endian
854 byteorder == 0: native byte order (writes a BOM mark)
855 byteorder == 1: big endian
856
857 If byteorder is 0, the output string will always start with the
858 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
859 prepended.
860
861*/
862
863PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
864 const Py_UNICODE *data, /* Unicode char buffer */
865 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
866 const char *errors, /* error handling */
867 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
868 );
869
Guido van Rossumd8225182000-03-10 22:33:05 +0000870/* --- UTF-16 Codecs ------------------------------------------------------ */
871
Guido van Rossum9e896b32000-04-05 20:11:21 +0000872/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000873 the corresponding Unicode object.
874
875 errors (if non-NULL) defines the error handling. It defaults
876 to "strict".
877
878 If byteorder is non-NULL, the decoder starts decoding using the
879 given byte order:
880
881 *byteorder == -1: little endian
882 *byteorder == 0: native order
883 *byteorder == 1: big endian
884
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000885 In native mode, the first two bytes of the stream are checked for a
886 BOM mark. If found, the BOM mark is analysed, the byte order
887 adjusted and the BOM skipped. In the other modes, no BOM mark
888 interpretation is done. After completion, *byteorder is set to the
889 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000890
891 If byteorder is NULL, the codec starts in native order mode.
892
893*/
894
Mark Hammond91a681d2002-08-12 07:21:58 +0000895PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Guido van Rossumd8225182000-03-10 22:33:05 +0000896 const char *string, /* UTF-16 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000897 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000898 const char *errors, /* error handling */
899 int *byteorder /* pointer to byteorder to use
900 0=native;-1=LE,1=BE; updated on
901 exit */
902 );
903
Walter Dörwald69652032004-09-07 20:24:22 +0000904PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
905 const char *string, /* UTF-16 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000906 Py_ssize_t length, /* size of string */
Walter Dörwald69652032004-09-07 20:24:22 +0000907 const char *errors, /* error handling */
908 int *byteorder, /* pointer to byteorder to use
909 0=native;-1=LE,1=BE; updated on
910 exit */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000911 Py_ssize_t *consumed /* bytes consumed */
Walter Dörwald69652032004-09-07 20:24:22 +0000912 );
913
Guido van Rossumd8225182000-03-10 22:33:05 +0000914/* Returns a Python string using the UTF-16 encoding in native byte
915 order. The string always starts with a BOM mark. */
916
Mark Hammond91a681d2002-08-12 07:21:58 +0000917PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000918 PyObject *unicode /* Unicode object */
919 );
920
921/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000922 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000923
924 If byteorder is not 0, output is written according to the following
925 byte order:
926
927 byteorder == -1: little endian
928 byteorder == 0: native byte order (writes a BOM mark)
929 byteorder == 1: big endian
930
931 If byteorder is 0, the output string will always start with the
932 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
933 prepended.
934
935 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
936 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000937 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000938
939*/
940
Mark Hammond91a681d2002-08-12 07:21:58 +0000941PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Guido van Rossumd8225182000-03-10 22:33:05 +0000942 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000943 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000944 const char *errors, /* error handling */
945 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
946 );
947
948/* --- Unicode-Escape Codecs ---------------------------------------------- */
949
Mark Hammond91a681d2002-08-12 07:21:58 +0000950PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000951 const char *string, /* Unicode-Escape encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000953 const char *errors /* error handling */
954 );
955
Mark Hammond91a681d2002-08-12 07:21:58 +0000956PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000957 PyObject *unicode /* Unicode object */
958 );
959
Mark Hammond91a681d2002-08-12 07:21:58 +0000960PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000961 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000962 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000963 );
964
965/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
966
Mark Hammond91a681d2002-08-12 07:21:58 +0000967PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000968 const char *string, /* Raw-Unicode-Escape encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000969 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +0000970 const char *errors /* error handling */
971 );
972
Mark Hammond91a681d2002-08-12 07:21:58 +0000973PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000974 PyObject *unicode /* Unicode object */
975 );
976
Mark Hammond91a681d2002-08-12 07:21:58 +0000977PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000978 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +0000979 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +0000980 );
981
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000982/* --- Unicode Internal Codec ---------------------------------------------
983
984 Only for internal use in _codecsmodule.c */
985
986PyObject *_PyUnicode_DecodeUnicodeInternal(
987 const char *string,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000988 Py_ssize_t length,
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000989 const char *errors
990 );
991
Guido van Rossumd8225182000-03-10 22:33:05 +0000992/* --- Latin-1 Codecs -----------------------------------------------------
993
994 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
995
996*/
997
Mark Hammond91a681d2002-08-12 07:21:58 +0000998PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Guido van Rossumd8225182000-03-10 22:33:05 +0000999 const char *string, /* Latin-1 encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001001 const char *errors /* error handling */
1002 );
1003
Mark Hammond91a681d2002-08-12 07:21:58 +00001004PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Guido van Rossumd8225182000-03-10 22:33:05 +00001005 PyObject *unicode /* Unicode object */
1006 );
1007
Mark Hammond91a681d2002-08-12 07:21:58 +00001008PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Guido van Rossumd8225182000-03-10 22:33:05 +00001009 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001010 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001011 const char *errors /* error handling */
1012 );
1013
1014/* --- ASCII Codecs -------------------------------------------------------
1015
1016 Only 7-bit ASCII data is accepted. All other codes generate errors.
1017
1018*/
1019
Mark Hammond91a681d2002-08-12 07:21:58 +00001020PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Guido van Rossumd8225182000-03-10 22:33:05 +00001021 const char *string, /* ASCII encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001023 const char *errors /* error handling */
1024 );
1025
Mark Hammond91a681d2002-08-12 07:21:58 +00001026PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Guido van Rossumd8225182000-03-10 22:33:05 +00001027 PyObject *unicode /* Unicode object */
1028 );
1029
Mark Hammond91a681d2002-08-12 07:21:58 +00001030PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Guido van Rossumd8225182000-03-10 22:33:05 +00001031 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001032 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001033 const char *errors /* error handling */
1034 );
1035
1036/* --- Character Map Codecs -----------------------------------------------
1037
1038 This codec uses mappings to encode and decode characters.
1039
1040 Decoding mappings must map single string characters to single
1041 Unicode characters, integers (which are then interpreted as Unicode
1042 ordinals) or None (meaning "undefined mapping" and causing an
1043 error).
1044
1045 Encoding mappings must map single Unicode characters to single
1046 string characters, integers (which are then interpreted as Latin-1
1047 ordinals) or None (meaning "undefined mapping" and causing an
1048 error).
1049
1050 If a character lookup fails with a LookupError, the character is
1051 copied as-is meaning that its ordinal value will be interpreted as
1052 Unicode or Latin-1 ordinal, respectively. Because of this, mappings only need
1053 to contain those mappings which map characters to different code
1054 points.
1055
1056*/
1057
Mark Hammond91a681d2002-08-12 07:21:58 +00001058PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +00001059 const char *string, /* Encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001060 Py_ssize_t length, /* size of string */
Guido van Rossumd8225182000-03-10 22:33:05 +00001061 PyObject *mapping, /* character mapping
1062 (char ordinal -> unicode ordinal) */
1063 const char *errors /* error handling */
1064 );
1065
Mark Hammond91a681d2002-08-12 07:21:58 +00001066PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Guido van Rossumd8225182000-03-10 22:33:05 +00001067 PyObject *unicode, /* Unicode object */
1068 PyObject *mapping /* character mapping
1069 (unicode ordinal -> char ordinal) */
1070 );
1071
Mark Hammond91a681d2002-08-12 07:21:58 +00001072PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +00001073 const Py_UNICODE *data, /* Unicode char buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001074 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001075 PyObject *mapping, /* character mapping
1076 (unicode ordinal -> char ordinal) */
1077 const char *errors /* error handling */
1078 );
1079
1080/* Translate a Py_UNICODE buffer of the given length by applying a
1081 character mapping table to it and return the resulting Unicode
1082 object.
1083
1084 The mapping table must map Unicode ordinal integers to Unicode
1085 ordinal integers or None (causing deletion of the character).
1086
1087 Mapping tables may be dictionaries or sequences. Unmapped character
1088 ordinals (ones which cause a LookupError) are left untouched and
1089 are copied as-is.
1090
1091*/
1092
Mark Hammond91a681d2002-08-12 07:21:58 +00001093PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +00001094 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001095 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumd8225182000-03-10 22:33:05 +00001096 PyObject *table, /* Translate table */
1097 const char *errors /* error handling */
1098 );
1099
Guido van Rossumefec1152000-03-28 02:01:15 +00001100#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +00001101
Guido van Rossumefec1152000-03-28 02:01:15 +00001102/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001103
Mark Hammond91a681d2002-08-12 07:21:58 +00001104PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001105 const char *string, /* MBCS encoded string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001106 Py_ssize_t length, /* size of string */
Guido van Rossumefec1152000-03-28 02:01:15 +00001107 const char *errors /* error handling */
1108 );
1109
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001110PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1111 const char *string, /* MBCS encoded string */
1112 Py_ssize_t length, /* size of string */
1113 const char *errors, /* error handling */
1114 Py_ssize_t *consumed /* bytes consumed */
1115 );
1116
Mark Hammond91a681d2002-08-12 07:21:58 +00001117PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +00001118 PyObject *unicode /* Unicode object */
1119 );
1120
Mark Hammond91a681d2002-08-12 07:21:58 +00001121PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +00001122 const Py_UNICODE *data, /* Unicode char buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001123 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossumefec1152000-03-28 02:01:15 +00001124 const char *errors /* error handling */
1125 );
1126
Guido van Rossumefec1152000-03-28 02:01:15 +00001127#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +00001128
Guido van Rossum9e896b32000-04-05 20:11:21 +00001129/* --- Decimal Encoder ---------------------------------------------------- */
1130
1131/* Takes a Unicode string holding a decimal value and writes it into
1132 an output buffer using standard ASCII digit codes.
1133
1134 The output buffer has to provide at least length+1 bytes of storage
1135 area. The output string is 0-terminated.
1136
1137 The encoder converts whitespace to ' ', decimal characters to their
1138 corresponding ASCII digit and all other Latin-1 characters except
1139 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1140 are treated as errors. This includes embedded NULL bytes.
1141
1142 Error handling is defined by the errors argument:
1143
1144 NULL or "strict": raise a ValueError
1145 "ignore": ignore the wrong characters (these are not copied to the
1146 output buffer)
1147 "replace": replaces illegal characters with '?'
1148
1149 Returns 0 on success, -1 on failure.
1150
1151*/
1152
Mark Hammond91a681d2002-08-12 07:21:58 +00001153PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Guido van Rossum9e896b32000-04-05 20:11:21 +00001154 Py_UNICODE *s, /* Unicode buffer */
Neal Norwitzd78f6cf2007-08-08 04:49:37 +00001155 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Guido van Rossum9e896b32000-04-05 20:11:21 +00001156 char *output, /* Output buffer; must have size >= length+1 */
1157 const char *errors /* error handling */
1158 );
1159
Guido van Rossumd8225182000-03-10 22:33:05 +00001160/* --- Methods & Slots ----------------------------------------------------
1161
1162 These are capable of handling Unicode objects and strings on input
1163 (we refer to them as strings in the descriptions) and return
1164 Unicode objects or integers as appropriate. */
1165
1166/* Concat two strings giving a new Unicode string. */
1167
Mark Hammond91a681d2002-08-12 07:21:58 +00001168PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Guido van Rossumd8225182000-03-10 22:33:05 +00001169 PyObject *left, /* Left string */
1170 PyObject *right /* Right string */
1171 );
1172
Walter Dörwald1ab83302007-05-18 17:15:44 +00001173/* Concat two strings and put the result in *pleft
1174 (sets *pleft to NULL on error) */
1175
1176PyAPI_FUNC(void) PyUnicode_Append(
1177 PyObject **pleft, /* Pointer to left string */
1178 PyObject *right /* Right string */
1179 );
1180
1181/* Concat two strings, put the result in *pleft and drop the right object
1182 (sets *pleft to NULL on error) */
1183
1184PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1185 PyObject **pleft, /* Pointer to left string */
1186 PyObject *right /* Right string */
1187 );
1188
Guido van Rossumd8225182000-03-10 22:33:05 +00001189/* Split a string giving a list of Unicode strings.
1190
1191 If sep is NULL, splitting will be done at all whitespace
1192 substrings. Otherwise, splits occur at the given separator.
1193
1194 At most maxsplit splits will be done. If negative, no limit is set.
1195
1196 Separators are not included in the resulting list.
1197
1198*/
1199
Mark Hammond91a681d2002-08-12 07:21:58 +00001200PyAPI_FUNC(PyObject*) PyUnicode_Split(
Guido van Rossumd8225182000-03-10 22:33:05 +00001201 PyObject *s, /* String to split */
1202 PyObject *sep, /* String separator */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t maxsplit /* Maxsplit count */
Guido van Rossumd8225182000-03-10 22:33:05 +00001204 );
1205
1206/* Ditto, but split at line breaks.
1207
1208 CRLF is considered to be one line break. Line breaks are not
1209 included in the resulting list unless keepends is true. */
1210
Mark Hammond91a681d2002-08-12 07:21:58 +00001211PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Guido van Rossumd8225182000-03-10 22:33:05 +00001212 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +00001213 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +00001214 );
1215
Thomas Wouters477c8d52006-05-27 19:21:47 +00001216/* Partition a string using a given separator. */
1217
1218PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1219 PyObject *s, /* String to partition */
1220 PyObject *sep /* String separator */
1221 );
1222
1223/* Partition a string using a given separator, searching from the end of the
1224 string. */
1225
1226PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1227 PyObject *s, /* String to partition */
1228 PyObject *sep /* String separator */
1229 );
1230
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001231/* Split a string giving a list of Unicode strings.
1232
1233 If sep is NULL, splitting will be done at all whitespace
1234 substrings. Otherwise, splits occur at the given separator.
1235
1236 At most maxsplit splits will be done; if maxsplit is negative, no
1237 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1238 the end of the string.
1239
1240 Separators are not included in the resulting list.
1241
1242*/
1243
1244PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1245 PyObject *s, /* String to split */
1246 PyObject *sep, /* String separator */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001247 Py_ssize_t maxsplit /* Maxsplit count */
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00001248 );
1249
Guido van Rossumd8225182000-03-10 22:33:05 +00001250/* Translate a string by applying a character mapping table to it and
1251 return the resulting Unicode object.
1252
1253 The mapping table must map Unicode ordinal integers to Unicode
1254 ordinal integers or None (causing deletion of the character).
1255
1256 Mapping tables may be dictionaries or sequences. Unmapped character
1257 ordinals (ones which cause a LookupError) are left untouched and
1258 are copied as-is.
1259
1260*/
1261
Mark Hammond91a681d2002-08-12 07:21:58 +00001262PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Guido van Rossumd8225182000-03-10 22:33:05 +00001263 PyObject *str, /* String */
1264 PyObject *table, /* Translate table */
1265 const char *errors /* error handling */
1266 );
1267
1268/* Join a sequence of strings using the given separator and return
1269 the resulting Unicode string. */
1270
Mark Hammond91a681d2002-08-12 07:21:58 +00001271PyAPI_FUNC(PyObject*) PyUnicode_Join(
Guido van Rossumd8225182000-03-10 22:33:05 +00001272 PyObject *separator, /* Separator string */
1273 PyObject *seq /* Sequence object */
1274 );
1275
1276/* Return 1 if substr matches str[start:end] at the given tail end, 0
1277 otherwise. */
1278
Martin v. Löwis18e16552006-02-15 17:27:45 +00001279PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
Guido van Rossumd8225182000-03-10 22:33:05 +00001280 PyObject *str, /* String */
1281 PyObject *substr, /* Prefix or Suffix string */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001282 Py_ssize_t start, /* Start index */
1283 Py_ssize_t end, /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001284 int direction /* Tail end: -1 prefix, +1 suffix */
1285 );
1286
1287/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001288 given search direction or -1 if not found. -2 is returned in case
1289 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001290
Martin v. Löwis18e16552006-02-15 17:27:45 +00001291PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
Guido van Rossumd8225182000-03-10 22:33:05 +00001292 PyObject *str, /* String */
1293 PyObject *substr, /* Substring to find */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001294 Py_ssize_t start, /* Start index */
1295 Py_ssize_t end, /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001296 int direction /* Find direction: +1 forward, -1 backward */
1297 );
1298
Barry Warsaw51ac5802000-03-20 16:36:48 +00001299/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001300
Martin v. Löwis18e16552006-02-15 17:27:45 +00001301PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
Guido van Rossumd8225182000-03-10 22:33:05 +00001302 PyObject *str, /* String */
1303 PyObject *substr, /* Substring to count */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001304 Py_ssize_t start, /* Start index */
1305 Py_ssize_t end /* Stop index */
Guido van Rossumd8225182000-03-10 22:33:05 +00001306 );
1307
Barry Warsaw51ac5802000-03-20 16:36:48 +00001308/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001309 and return the resulting Unicode object. */
1310
Mark Hammond91a681d2002-08-12 07:21:58 +00001311PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Guido van Rossumd8225182000-03-10 22:33:05 +00001312 PyObject *str, /* String */
1313 PyObject *substr, /* Substring to find */
1314 PyObject *replstr, /* Substring to replace */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001315 Py_ssize_t maxcount /* Max. number of replacements to apply;
Guido van Rossumd8225182000-03-10 22:33:05 +00001316 -1 = all */
1317 );
1318
1319/* Compare two strings and return -1, 0, 1 for less than, equal,
1320 greater than, respectively. */
1321
Mark Hammond91a681d2002-08-12 07:21:58 +00001322PyAPI_FUNC(int) PyUnicode_Compare(
Guido van Rossumd8225182000-03-10 22:33:05 +00001323 PyObject *left, /* Left string */
1324 PyObject *right /* Right string */
1325 );
1326
Martin v. Löwis5b222132007-06-10 09:51:05 +00001327PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1328 PyObject *left,
1329 const char *right
1330 );
1331
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001332/* Rich compare two strings and return one of the following:
1333
1334 - NULL in case an exception was raised
1335 - Py_True or Py_False for successful comparisons
1336 - Py_NotImplemented in case the type combination is unknown
1337
1338 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1339 case the conversion of the arguments to Unicode fails with a
1340 UnicodeDecodeError.
1341
1342 Possible values for op:
1343
1344 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1345
1346*/
1347
1348PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1349 PyObject *left, /* Left string */
1350 PyObject *right, /* Right string */
1351 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1352 );
1353
Thomas Wouters7e474022000-07-16 12:04:32 +00001354/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001355 the resulting Unicode string. */
1356
Mark Hammond91a681d2002-08-12 07:21:58 +00001357PyAPI_FUNC(PyObject *) PyUnicode_Format(
Guido van Rossumd8225182000-03-10 22:33:05 +00001358 PyObject *format, /* Format string */
1359 PyObject *args /* Argument tuple or dictionary */
1360 );
1361
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001362/* Checks whether element is contained in container and returns 1/0
1363 accordingly.
1364
1365 element has to coerce to a one-element Unicode string. -1 is
1366 returned in case of an error. */
1367
Mark Hammond91a681d2002-08-12 07:21:58 +00001368PyAPI_FUNC(int) PyUnicode_Contains(
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001369 PyObject *container, /* Container string */
1370 PyObject *element /* Element string */
1371 );
1372
Martin v. Löwis47383402007-08-15 07:32:56 +00001373/* Checks whether argument is a valid identifier. */
1374
1375PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1376
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001377/* Externally visible for str.strip(unicode) */
Mark Hammond91a681d2002-08-12 07:21:58 +00001378PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001379 PyUnicodeObject *self,
1380 int striptype,
1381 PyObject *sepobj
1382 );
1383
Guido van Rossumd8225182000-03-10 22:33:05 +00001384/* === Characters Type APIs =============================================== */
1385
1386/* These should not be used directly. Use the Py_UNICODE_IS* and
1387 Py_UNICODE_TO* macros instead.
1388
1389 These APIs are implemented in Objects/unicodectype.c.
1390
1391*/
1392
Mark Hammond91a681d2002-08-12 07:21:58 +00001393PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001394 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001395 );
1396
Mark Hammond91a681d2002-08-12 07:21:58 +00001397PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001398 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001399 );
1400
Mark Hammond91a681d2002-08-12 07:21:58 +00001401PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001402 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001403 );
1404
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001405PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1406 Py_UNICODE ch /* Unicode character */
1407 );
1408
1409PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1410 Py_UNICODE ch /* Unicode character */
1411 );
1412
Mark Hammond91a681d2002-08-12 07:21:58 +00001413PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Tim Peters2576c972005-10-29 02:33:18 +00001414 const Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001415 );
1416
Mark Hammond91a681d2002-08-12 07:21:58 +00001417PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Tim Peters2576c972005-10-29 02:33:18 +00001418 const Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001419 );
1420
Mark Hammond91a681d2002-08-12 07:21:58 +00001421PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001422 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001423 );
1424
Mark Hammond91a681d2002-08-12 07:21:58 +00001425PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001426 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001427 );
1428
Mark Hammond91a681d2002-08-12 07:21:58 +00001429PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001430 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001431 );
1432
Mark Hammond91a681d2002-08-12 07:21:58 +00001433PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001434 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001435 );
1436
Mark Hammond91a681d2002-08-12 07:21:58 +00001437PyAPI_FUNC(int) _PyUnicode_ToDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001438 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001439 );
1440
Mark Hammond91a681d2002-08-12 07:21:58 +00001441PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001442 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001443 );
1444
Mark Hammond91a681d2002-08-12 07:21:58 +00001445PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001446 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001447 );
1448
Mark Hammond91a681d2002-08-12 07:21:58 +00001449PyAPI_FUNC(int) _PyUnicode_IsDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001450 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001451 );
1452
Mark Hammond91a681d2002-08-12 07:21:58 +00001453PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001454 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001455 );
1456
Mark Hammond91a681d2002-08-12 07:21:58 +00001457PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001458 Py_UNICODE ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001459 );
1460
Martin v. Löwis5b222132007-06-10 09:51:05 +00001461PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1462
1463PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1464 Py_UNICODE *s1, const Py_UNICODE *s2);
1465
1466PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1467 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1468
1469PyAPI_FUNC(int) Py_UNICODE_strcmp(
1470 const Py_UNICODE *s1, const Py_UNICODE *s2);
1471
1472PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1473 const Py_UNICODE *s, Py_UNICODE c
1474 );
1475
Guido van Rossumd8225182000-03-10 22:33:05 +00001476#ifdef __cplusplus
1477}
1478#endif
Guido van Rossumd8225182000-03-10 22:33:05 +00001479#endif /* !Py_UNICODEOBJECT_H */