blob: 24e59de4ce00fe52c77ea3a804869f5a3ae7d1aa [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +000055#include <ctype.h>
Guido van Rossumd8225182000-03-10 22:33:05 +000056
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
Martin v. Löwis339d0f72001-08-17 18:39:25 +000061#ifndef Py_USING_UNICODE
62
63#define PyUnicode_Check(op) 0
Tim Peters78e0fc72001-09-11 03:07:38 +000064#define PyUnicode_CheckExact(op) 0
Martin v. Löwis339d0f72001-08-17 18:39:25 +000065
66#else
67
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000068/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
69 properly set, but the default rules below don't set it. I'll
70 sort this out some other day -- fredrik@pythonware.com */
71
72#ifndef Py_UNICODE_SIZE
73#error Must define Py_UNICODE_SIZE
74#endif
75
Fredrik Lundh8f455852001-06-27 18:59:43 +000076/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
77 strings are stored as UCS-2 (with limited support for UTF-16) */
78
79#if Py_UNICODE_SIZE >= 4
80#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000081#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000082
Guido van Rossumd8225182000-03-10 22:33:05 +000083/* Set these flags if the platform has "wchar.h", "wctype.h" and the
84 wchar_t type is a 16-bit unsigned type */
85/* #define HAVE_WCHAR_H */
86/* #define HAVE_USABLE_WCHAR_T */
87
88/* Defaults for various platforms */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000089#ifndef PY_UNICODE_TYPE
Guido van Rossumd8225182000-03-10 22:33:05 +000090
Fredrik Lundh1294ad02001-06-26 17:17:07 +000091/* Windows has a usable wchar_t type (unless we're using UCS-4) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000092# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
Guido van Rossumd8225182000-03-10 22:33:05 +000093# define HAVE_USABLE_WCHAR_T
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000094# define PY_UNICODE_TYPE wchar_t
95# endif
96
Fredrik Lundh8f455852001-06-27 18:59:43 +000097# if defined(Py_UNICODE_WIDE)
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000098# define PY_UNICODE_TYPE Py_UCS4
Guido van Rossumd8225182000-03-10 22:33:05 +000099# endif
100
101#endif
102
103/* If the compiler provides a wchar_t type we try to support it
104 through the interface functions PyUnicode_FromWideChar() and
105 PyUnicode_AsWideChar(). */
106
107#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000108# ifndef HAVE_WCHAR_H
109# define HAVE_WCHAR_H
110# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000111#endif
112
113#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000114/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
115# ifdef _HAVE_BSDI
116# include <time.h>
117# endif
Marc-André Lemburg5e6007c2001-09-19 11:21:03 +0000118# include <wchar.h>
Guido van Rossumd8225182000-03-10 22:33:05 +0000119#endif
120
/*
 * Py_UCS4: use this typedef when you need to represent a UTF-16
 * surrogate pair as a single unsigned integer.
 *
 * unsigned int is used when SIZEOF_INT says it is at least four bytes
 * wide.  Otherwise unsigned long is used unconditionally: ISO C
 * guarantees that unsigned long can hold at least 32 bits, so the
 * typedef is always provided, even if SIZEOF_LONG is not defined.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif
130
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000131typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000132
Marc-André Lemburgb5ac6f62001-07-31 14:30:16 +0000133/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
134
/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
   produce different external names.  Combining a Python interpreter
   and extensions that were compiled with different Unicode width
   assumptions then causes import errors.

   The UTF-7 codec entry points (PyUnicode_DecodeUTF7 and
   PyUnicode_EncodeUTF7) are mangled like every other codec API in
   this header; PyUnicode_EncodeUTF7 takes a Py_UNICODE buffer, so it
   depends on the Unicode width in the same way.

   NOTE: mangling a name changes the exported symbol, so extensions
   that call the UTF-7 functions must be recompiled against this
   header. */

#ifndef Py_UNICODE_WIDE

# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
# define PyUnicode_Compare PyUnicodeUCS2_Compare
# define PyUnicode_Concat PyUnicodeUCS2_Concat
# define PyUnicode_Contains PyUnicodeUCS2_Contains
# define PyUnicode_Count PyUnicodeUCS2_Count
# define PyUnicode_Decode PyUnicodeUCS2_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF7 PyUnicodeUCS2_DecodeUTF7
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS2_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF7 PyUnicodeUCS2_EncodeUTF7
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS2_Find
# define PyUnicode_Format PyUnicodeUCS2_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
# define PyUnicode_Split PyUnicodeUCS2_Split
# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS2_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
# define _PyUnicode_Init _PyUnicodeUCS2_Init
# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase

#else

# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
# define PyUnicode_Compare PyUnicodeUCS4_Compare
# define PyUnicode_Concat PyUnicodeUCS4_Concat
# define PyUnicode_Contains PyUnicodeUCS4_Contains
# define PyUnicode_Count PyUnicodeUCS4_Count
# define PyUnicode_Decode PyUnicodeUCS4_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF7 PyUnicodeUCS4_DecodeUTF7
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS4_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF7 PyUnicodeUCS4_EncodeUTF7
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS4_Find
# define PyUnicode_Format PyUnicodeUCS4_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
# define PyUnicode_Split PyUnicodeUCS4_Split
# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS4_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
# define _PyUnicode_Init _PyUnicodeUCS4_Init
# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase

#endif
284
Guido van Rossumd8225182000-03-10 22:33:05 +0000285/* --- Internal Unicode Operations ---------------------------------------- */
286
/* Character classification and conversion macros.

   If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-ctype-functions.  This reduces the
   interpreter's code size.

   Only the space/lower/upper/alpha tests and the lower/upper
   conversions differ between the two configurations; the remaining
   macros always map to Python's own helpers and are defined once
   after the conditional. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include <wctype.h>

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* Identical in both configurations. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000341
/* Non-zero if ch is alphabetic or is classified as a decimal, digit or
   numeric character.  The tests run in that order and stop at the
   first one that succeeds.  ch may be expanded up to four times, so it
   should not have side effects. */
#define Py_UNICODE_ISALNUM(ch)          \
    (Py_UNICODE_ISALPHA(ch)   ||        \
     Py_UNICODE_ISDECIMAL(ch) ||        \
     Py_UNICODE_ISDIGIT(ch)   ||        \
     Py_UNICODE_ISNUMERIC(ch))
347
/* Copy length Py_UNICODE elements from source to target using
   memcpy(), so the two regions must not overlap.  The expression has
   the value memcpy() returns, i.e. target. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
350
/* Store value in each of the first length elements of target.

   target and value are evaluated exactly once, before the loop, so
   arguments with side effects behave as they would in a function
   call (value is evaluated even when length is 0).  The loop
   variables use names that are unlikely to clash with identifiers in
   the caller's arguments; the earlier version declared a local named
   `i`, which silently captured any `i` used in target, value or
   length.

   length is still evaluated on every iteration and must therefore be
   free of side effects. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        int py_fill_idx_; \
        Py_UNICODE *py_fill_dst_ = (target); \
        Py_UNICODE py_fill_val_ = (value); \
        for (py_fill_idx_ = 0; py_fill_idx_ < (length); py_fill_idx_++) \
            py_fill_dst_[py_fill_idx_] = py_fill_val_; \
    } while (0)
354
/* Non-zero if the contents of substring occur in string starting at
   index offset.  The first element is compared directly before the
   full memcmp() over substring->length elements is attempted.  Both
   arguments are accessed through their ->str and ->length members; no
   bounds checking is done here. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    (((string)->str[(offset)] == (substring)->str[0]) && \
     memcmp(&(string)->str[(offset)], (substring)->str, \
            sizeof(Py_UNICODE) * (substring)->length) == 0)
359
Barry Warsaw51ac5802000-03-20 16:36:48 +0000360#ifdef __cplusplus
361extern "C" {
362#endif
363
Guido van Rossumd8225182000-03-10 22:33:05 +0000364/* --- Unicode Type ------------------------------------------------------- */
365
366typedef struct {
367 PyObject_HEAD
368 int length; /* Length of raw Unicode data in buffer */
369 Py_UNICODE *str; /* Raw Unicode buffer */
370 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000371 PyObject *defenc; /* (Default) Encoded version as Python
372 string, or NULL; this is used for
373 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000374} PyUnicodeObject;
375
Mark Hammond91a681d2002-08-12 07:21:58 +0000376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
Guido van Rossumd8225182000-03-10 22:33:05 +0000377
Guido van Rossum5eef77a2001-08-30 03:08:07 +0000378#define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type)
Tim Peters78e0fc72001-09-11 03:07:38 +0000379#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
Guido van Rossumd8225182000-03-10 22:33:05 +0000380
/* Fast access macros.

   Each macro casts op to PyUnicodeObject * and reads a member
   directly; no type check is performed.

   PyUnicode_GET_SIZE       length member
   PyUnicode_GET_DATA_SIZE  length member times sizeof(Py_UNICODE)
   PyUnicode_AS_UNICODE     str member (Py_UNICODE *)
   PyUnicode_AS_DATA        str member viewed as const char * */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)
#define PyUnicode_GET_DATA_SIZE(op) \
    (sizeof(Py_UNICODE) * ((PyUnicodeObject *)(op))->length)
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)
#define PyUnicode_AS_DATA(op) \
    ((const char *)(((PyUnicodeObject *)(op))->str))
390
391/* --- Constants ---------------------------------------------------------- */
392
393/* This Unicode character will be used as replacement character during
394 decoding if the errors argument is set to "replace". Note: the
395 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
396 Unicode 3.0. */
397
398#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
399
400/* === Public API ========================================================= */
401
402/* --- Plain Py_UNICODE --------------------------------------------------- */
403
404/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 size.
406
407 u may be NULL which causes the contents to be undefined. It is the
408 user's responsibility to fill in the needed data afterwards. Note
409 that modifying the Unicode object contents after construction is
410 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000411
412 The buffer is copied into the new object. */
413
Mark Hammond91a681d2002-08-12 07:21:58 +0000414PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000415 const Py_UNICODE *u, /* Unicode buffer */
416 int size /* size of buffer */
417 );
418
419/* Return a read-only pointer to the Unicode object's internal
420 Py_UNICODE buffer. */
421
Mark Hammond91a681d2002-08-12 07:21:58 +0000422PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000423 PyObject *unicode /* Unicode object */
424 );
425
426/* Get the length of the Unicode object. */
427
Mark Hammond91a681d2002-08-12 07:21:58 +0000428PyAPI_FUNC(int) PyUnicode_GetSize(
Guido van Rossumd8225182000-03-10 22:33:05 +0000429 PyObject *unicode /* Unicode object */
430 );
431
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432/* Get the maximum ordinal for a Unicode character. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000433PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000434
Guido van Rossum52c23592000-04-10 13:41:41 +0000435/* Resize an already allocated Unicode object to the new size length.
436
437 *unicode is modified to point to the new (resized) object and 0
438 returned on success.
439
440 This API may only be called by the function which also called the
441 Unicode constructor. The refcount on the object must be 1. Otherwise,
442 an error is returned.
443
444 Error handling is implemented as follows: an exception is set, -1
445 is returned and *unicode left untouched.
446
447*/
448
Mark Hammond91a681d2002-08-12 07:21:58 +0000449PyAPI_FUNC(int) PyUnicode_Resize(
Guido van Rossum52c23592000-04-10 13:41:41 +0000450 PyObject **unicode, /* Pointer to the Unicode object */
451 int length /* New length */
452 );
453
Guido van Rossumd8225182000-03-10 22:33:05 +0000454/* Coerce obj to an Unicode object and return a reference with
455 *incremented* refcount.
456
457 Coercion is done in the following way:
458
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000459 1. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000460 under the assumptions that they contain data using the current
461 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463 2. All other objects (including Unicode objects) raise an
464 exception.
Guido van Rossumd8225182000-03-10 22:33:05 +0000465
466 The API returns NULL in case of an error. The caller is responsible
467 for decref'ing the returned objects.
468
469*/
470
Mark Hammond91a681d2002-08-12 07:21:58 +0000471PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000472 register PyObject *obj, /* Object */
473 const char *encoding, /* encoding */
474 const char *errors /* error handling */
475 );
476
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477/* Coerce obj to an Unicode object and return a reference with
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000478 *incremented* refcount.
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479
480 Unicode objects are passed back as-is (subclasses are converted to
481 true Unicode objects), all other objects are delegated to
482 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
483 using the default encoding as basis for decoding the object.
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000484
485 The API returns NULL in case of an error. The caller is responsible
486 for decref'ing the returned objects.
487
488*/
489
Mark Hammond91a681d2002-08-12 07:21:58 +0000490PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
Guido van Rossumd8225182000-03-10 22:33:05 +0000491 register PyObject *obj /* Object */
492 );
493
494/* --- wchar_t support for platforms which support it --------------------- */
495
#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object.

   (The `register` storage class has been dropped from the parameter:
   it has no effect in a declaration and is rejected by C++17
   compilers that include this header.) */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    int size                    /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */

PyAPI_FUNC(int) PyUnicode_AsWideChar(
    PyUnicodeObject *unicode,   /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    int size                    /* size of buffer */
    );

#endif
521
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000522/* --- Unicode ordinals --------------------------------------------------- */
523
524/* Create a Unicode Object from the given Unicode code point ordinal.
525
526 The ordinal must be in range(0x10000) on narrow Python builds
527 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
528 raised in case it is not.
529
530*/
531
Marc-André Lemburg9c329de2002-08-12 08:19:10 +0000532PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000533
Guido van Rossumd8225182000-03-10 22:33:05 +0000534/* === Builtin Codecs =====================================================
535
536 Many of these APIs take two arguments encoding and errors. These
537 parameters encoding and errors have the same semantics as the ones
538 of the builtin unicode() API.
539
Fred Drakecb093fe2000-05-09 19:51:53 +0000540 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000541
542 Error handling is set by errors which may also be set to NULL
543 meaning to use the default handling defined for the codec. Default
544 error handling for all builtin codecs is "strict" (ValueErrors are
545 raised).
546
547 The codecs all use a similar interface. Only deviations from the
548 generic ones are documented.
549
550*/
551
Fred Drakecb093fe2000-05-09 19:51:53 +0000552/* --- Manage the default encoding ---------------------------------------- */
553
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000554/* Return a Python string holding the default encoded value of the
555 Unicode object.
556
557 The resulting string is cached in the Unicode object for subsequent
558 usage by this function. The cached version is needed to implement
559 the character buffer interface and will live (at least) as long as
560 the Unicode object itself.
561
562 The refcount of the string is *not* incremented.
563
564 *** Exported for internal use by the interpreter only !!! ***
565
566*/
567
Mark Hammond91a681d2002-08-12 07:21:58 +0000568PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
Jeremy Hylton3ce45382001-07-30 22:34:24 +0000569 PyObject *, const char *);
570
Fred Drakecb093fe2000-05-09 19:51:53 +0000571/* Returns the currently active default encoding.
572
573 The default encoding is currently implemented as a run-time settable
574 process global. This may change in future versions of the
575 interpreter to become a parameter which is managed on a per-thread
576 basis.
577
578 */
579
Mark Hammond91a681d2002-08-12 07:21:58 +0000580PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000581
582/* Sets the currently active default encoding.
583
584 Returns 0 on success, -1 in case of an error.
585
586 */
587
Mark Hammond91a681d2002-08-12 07:21:58 +0000588PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
Fred Drakecb093fe2000-05-09 19:51:53 +0000589 const char *encoding /* Encoding name in standard form */
590 );
591
Guido van Rossumd8225182000-03-10 22:33:05 +0000592/* --- Generic Codecs ----------------------------------------------------- */
593
594/* Create a Unicode object by decoding the encoded string s of the
595 given size. */
596
Mark Hammond91a681d2002-08-12 07:21:58 +0000597PyAPI_FUNC(PyObject*) PyUnicode_Decode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000598 const char *s, /* encoded string */
599 int size, /* size of buffer */
600 const char *encoding, /* encoding */
601 const char *errors /* error handling */
602 );
603
604/* Encodes a Py_UNICODE buffer of the given size and returns a
605 Python string object. */
606
Mark Hammond91a681d2002-08-12 07:21:58 +0000607PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Guido van Rossumd8225182000-03-10 22:33:05 +0000608 const Py_UNICODE *s, /* Unicode char buffer */
609 int size, /* number of Py_UNICODE chars to encode */
610 const char *encoding, /* encoding */
611 const char *errors /* error handling */
612 );
613
614/* Encodes a Unicode object and returns the result as Python string
615 object. */
616
Mark Hammond91a681d2002-08-12 07:21:58 +0000617PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000618 PyObject *unicode, /* Unicode object */
619 const char *encoding, /* encoding */
620 const char *errors /* error handling */
621 );
622
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000623/* --- UTF-7 Codecs ------------------------------------------------------- */
624
Mark Hammond91a681d2002-08-12 07:21:58 +0000625PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000626 const char *string, /* UTF-7 encoded string */
627 int length, /* size of string */
628 const char *errors /* error handling */
629 );
630
Mark Hammond91a681d2002-08-12 07:21:58 +0000631PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000632 const Py_UNICODE *data, /* Unicode char buffer */
633 int length, /* number of Py_UNICODE chars to encode */
634 int encodeSetO, /* force the encoder to encode characters in
635 Set O, as described in RFC2152 */
636 int encodeWhiteSpace, /* force the encoder to encode space, tab,
637 carriage return and linefeed characters */
638 const char *errors /* error handling */
639 );
640
Guido van Rossumd8225182000-03-10 22:33:05 +0000641/* --- UTF-8 Codecs ------------------------------------------------------- */
642
/* UTF-8 codec entry points; all three return a PyObject*.

   PyUnicode_DecodeUTF8   takes an encoded char buffer and its size,
   PyUnicode_AsUTF8String takes a Unicode object,
   PyUnicode_EncodeUTF8   takes a Py_UNICODE buffer and its length.

   NOTE(review): the result types and the meaning of errors == NULL are
   not stated for this codec; presumably they match the UTF-16 codec
   documented below (Unicode object / Python string, "strict") --
   confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000643PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
Guido van Rossumd8225182000-03-10 22:33:05 +0000644 const char *string, /* UTF-8 encoded string */
645 int length, /* size of string */
646 const char *errors /* error handling */
647 );
648
Mark Hammond91a681d2002-08-12 07:21:58 +0000649PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000650 PyObject *unicode /* Unicode object */
651 );
652
Mark Hammond91a681d2002-08-12 07:21:58 +0000653PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Guido van Rossumd8225182000-03-10 22:33:05 +0000654 const Py_UNICODE *data, /* Unicode char buffer */
655 int length, /* number of Py_UNICODE chars to encode */
656 const char *errors /* error handling */
657 );
658
659/* --- UTF-16 Codecs ------------------------------------------------------ */
660
Guido van Rossum9e896b32000-04-05 20:11:21 +0000661/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000662 the corresponding Unicode object.
663
664 errors (if non-NULL) defines the error handling. It defaults
665 to "strict".
666
667 If byteorder is non-NULL, the decoder starts decoding using the
668 given byte order:
669
670 *byteorder == -1: little endian
671 *byteorder == 0: native order
672 *byteorder == 1: big endian
673
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000674 In native mode (*byteorder == 0, or byteorder == NULL), the first two
675 bytes of the stream are checked for a BOM mark. If found, the BOM mark
676 is analysed, the byte order adjusted and the BOM skipped. In the other
677 modes, no BOM mark interpretation is done. After completion,
678 *byteorder is set to the current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000679
680 If byteorder is NULL, the codec starts in native order mode.
681
682*/
683
Mark Hammond91a681d2002-08-12 07:21:58 +0000684PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
Guido van Rossumd8225182000-03-10 22:33:05 +0000685 const char *string, /* UTF-16 encoded string */
686 int length, /* size of string */
687 const char *errors, /* error handling */
688 int *byteorder /* pointer to byteorder to use
689 0=native;-1=LE,1=BE; updated on
690 exit */
691 );
692
693/* Returns a Python string using the UTF-16 encoding in native byte
694 order. The string always starts with a BOM mark. */
695
/* The layout described above (native order, leading BOM) is the same one
   PyUnicode_EncodeUTF16 documents for byteorder == 0. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000696PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000697 PyObject *unicode /* Unicode object */
698 );
699
700/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000701 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000702
703 The byteorder argument selects the byte order of the output as
704 follows:
705
706 byteorder == -1: little endian
707 byteorder == 0: native byte order (writes a BOM mark)
708 byteorder == 1: big endian
709
710 If byteorder is 0, the output string will always start with the
711 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
712 prepended.
713
714 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
715 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000716 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000717
718*/
719
Mark Hammond91a681d2002-08-12 07:21:58 +0000720PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Guido van Rossumd8225182000-03-10 22:33:05 +0000721 const Py_UNICODE *data, /* Unicode char buffer */
722 int length, /* number of Py_UNICODE chars to encode */
723 const char *errors, /* error handling */
724 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
725 );
726
727/* --- Unicode-Escape Codecs ---------------------------------------------- */
728
/* Unicode-Escape codec entry points; all three return a PyObject*.

   PyUnicode_DecodeUnicodeEscape   takes an encoded char buffer, its
                                   size and an errors argument,
   PyUnicode_AsUnicodeEscapeString takes a Unicode object,
   PyUnicode_EncodeUnicodeEscape   takes a Py_UNICODE buffer and its
                                   length; unlike most other encoders
                                   in this header it has no errors
                                   argument.

   NOTE(review): result types are not stated for this codec; presumably
   Unicode object for decoding and Python string for encoding, as for
   UTF-16 -- confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000729PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000730 const char *string, /* Unicode-Escape encoded string */
731 int length, /* size of string */
732 const char *errors /* error handling */
733 );
734
Mark Hammond91a681d2002-08-12 07:21:58 +0000735PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000736 PyObject *unicode /* Unicode object */
737 );
738
Mark Hammond91a681d2002-08-12 07:21:58 +0000739PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000740 const Py_UNICODE *data, /* Unicode char buffer */
741 int length /* Number of Py_UNICODE chars to encode */
742 );
743
744/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
745
/* Raw-Unicode-Escape codec entry points; all three return a PyObject*.

   PyUnicode_DecodeRawUnicodeEscape   takes an encoded char buffer, its
                                      size and an errors argument,
   PyUnicode_AsRawUnicodeEscapeString takes a Unicode object,
   PyUnicode_EncodeRawUnicodeEscape   takes a Py_UNICODE buffer and its
                                      length; it has no errors argument.

   NOTE(review): result types are not stated for this codec; presumably
   Unicode object for decoding and Python string for encoding, as for
   UTF-16 -- confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000746PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000747 const char *string, /* Raw-Unicode-Escape encoded string */
748 int length, /* size of string */
749 const char *errors /* error handling */
750 );
751
Mark Hammond91a681d2002-08-12 07:21:58 +0000752PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000753 PyObject *unicode /* Unicode object */
754 );
755
Mark Hammond91a681d2002-08-12 07:21:58 +0000756PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Guido van Rossumd8225182000-03-10 22:33:05 +0000757 const Py_UNICODE *data, /* Unicode char buffer */
758 int length /* Number of Py_UNICODE chars to encode */
759 );
760
761/* --- Latin-1 Codecs -----------------------------------------------------
762
763 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
764
765*/
766
/* Latin-1 codec entry points; all three return a PyObject*.

   PyUnicode_DecodeLatin1   takes an encoded char buffer and its size,
   PyUnicode_AsLatin1String takes a Unicode object,
   PyUnicode_EncodeLatin1   takes a Py_UNICODE buffer and its length.

   NOTE(review): the result types and the meaning of errors == NULL are
   not stated for this codec; presumably they match the UTF-16 codec
   (Unicode object / Python string, "strict") -- confirm against the
   implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000767PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
Guido van Rossumd8225182000-03-10 22:33:05 +0000768 const char *string, /* Latin-1 encoded string */
769 int length, /* size of string */
770 const char *errors /* error handling */
771 );
772
Mark Hammond91a681d2002-08-12 07:21:58 +0000773PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
Guido van Rossumd8225182000-03-10 22:33:05 +0000774 PyObject *unicode /* Unicode object */
775 );
776
Mark Hammond91a681d2002-08-12 07:21:58 +0000777PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Guido van Rossumd8225182000-03-10 22:33:05 +0000778 const Py_UNICODE *data, /* Unicode char buffer */
779 int length, /* Number of Py_UNICODE chars to encode */
780 const char *errors /* error handling */
781 );
782
783/* --- ASCII Codecs -------------------------------------------------------
784
785 Only 7-bit ASCII data is accepted. All other codes generate errors.
786
787*/
788
/* ASCII codec entry points; all three return a PyObject*.

   PyUnicode_DecodeASCII   takes an encoded char buffer and its size,
   PyUnicode_AsASCIIString takes a Unicode object,
   PyUnicode_EncodeASCII   takes a Py_UNICODE buffer and its length.

   NOTE(review): the result types and the meaning of errors == NULL are
   not stated for this codec; presumably they match the UTF-16 codec
   (Unicode object / Python string, "strict") -- confirm against the
   implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000789PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
Guido van Rossumd8225182000-03-10 22:33:05 +0000790 const char *string, /* ASCII encoded string */
791 int length, /* size of string */
792 const char *errors /* error handling */
793 );
794
Mark Hammond91a681d2002-08-12 07:21:58 +0000795PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000796 PyObject *unicode /* Unicode object */
797 );
798
Mark Hammond91a681d2002-08-12 07:21:58 +0000799PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Guido van Rossumd8225182000-03-10 22:33:05 +0000800 const Py_UNICODE *data, /* Unicode char buffer */
801 int length, /* Number of Py_UNICODE chars to encode */
802 const char *errors /* error handling */
803 );
804
805/* --- Character Map Codecs -----------------------------------------------
806
807 This codec uses mappings to encode and decode characters.
808
809 Decoding mappings must map single string characters to single
810 Unicode characters, integers (which are then interpreted as Unicode
811 ordinals) or None (meaning "undefined mapping" and causing an
812 error).
813
814 Encoding mappings must map single Unicode characters to single
815 string characters, integers (which are then interpreted as Latin-1
816 ordinals) or None (meaning "undefined mapping" and causing an
817 error).
818
819 If a character lookup fails with a LookupError, the character is
820 copied as-is meaning that its ordinal value will be interpreted as
821 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
822 to contain those mappings which map characters to different code
823 points.
824
825*/
826
/* Character map codec entry points; all three return a PyObject*.

   PyUnicode_DecodeCharmap   takes an encoded char buffer, its size and
                             a decoding mapping (char ordinal ->
                             unicode ordinal),
   PyUnicode_AsCharmapString takes a Unicode object and an encoding
                             mapping (unicode ordinal -> char ordinal);
                             it has no errors argument,
   PyUnicode_EncodeCharmap   takes a Py_UNICODE buffer, its length and
                             an encoding mapping.

   NOTE(review): result types are not stated for this codec; presumably
   Unicode object for decoding and Python string for encoding, as for
   UTF-16 -- confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000827PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +0000828 const char *string, /* Encoded string */
829 int length, /* size of string */
830 PyObject *mapping, /* character mapping
831 (char ordinal -> unicode ordinal) */
832 const char *errors /* error handling */
833 );
834
Mark Hammond91a681d2002-08-12 07:21:58 +0000835PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
Guido van Rossumd8225182000-03-10 22:33:05 +0000836 PyObject *unicode, /* Unicode object */
837 PyObject *mapping /* character mapping
838 (unicode ordinal -> char ordinal) */
839 );
840
Mark Hammond91a681d2002-08-12 07:21:58 +0000841PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +0000842 const Py_UNICODE *data, /* Unicode char buffer */
843 int length, /* Number of Py_UNICODE chars to encode */
844 PyObject *mapping, /* character mapping
845 (unicode ordinal -> char ordinal) */
846 const char *errors /* error handling */
847 );
848
849/* Translate a Py_UNICODE buffer of the given length by applying a
850 character mapping table to it and return the resulting Unicode
851 object.
852
853 The mapping table must map Unicode ordinal integers to Unicode
854 ordinal integers or None (causing deletion of the character).
855
856 Mapping tables may be dictionaries or sequences. Unmapped character
857 ordinals (ones which cause a LookupError) are left untouched and
858 are copied as-is.
859
860*/
861
Mark Hammond91a681d2002-08-12 07:21:58 +0000862PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Guido van Rossumd8225182000-03-10 22:33:05 +0000863 const Py_UNICODE *data, /* Unicode char buffer */
864 int length, /* Number of Py_UNICODE chars to translate */
865 PyObject *table, /* Translate table */
866 const char *errors /* error handling */
867 );
868
Guido van Rossumefec1152000-03-28 02:01:15 +0000869#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000870
Guido van Rossumefec1152000-03-28 02:01:15 +0000871/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000872
/* MBCS codec entry points; all three return a PyObject* and are only
   declared when MS_WIN32 is defined.

   PyUnicode_DecodeMBCS   takes an encoded char buffer and its size,
   PyUnicode_AsMBCSString takes a Unicode object,
   PyUnicode_EncodeMBCS   takes a Py_UNICODE buffer and its length.

   NOTE(review): the result types and the meaning of errors == NULL are
   not stated for this codec; presumably they match the UTF-16 codec
   (Unicode object / Python string, "strict") -- confirm against the
   implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000873PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +0000874 const char *string, /* MBCS encoded string */
875 int length, /* size of string */
876 const char *errors /* error handling */
877 );
878
Mark Hammond91a681d2002-08-12 07:21:58 +0000879PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
Guido van Rossumefec1152000-03-28 02:01:15 +0000880 PyObject *unicode /* Unicode object */
881 );
882
Mark Hammond91a681d2002-08-12 07:21:58 +0000883PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Guido van Rossumefec1152000-03-28 02:01:15 +0000884 const Py_UNICODE *data, /* Unicode char buffer */
885 int length, /* Number of Py_UNICODE chars to encode */
886 const char *errors /* error handling */
887 );
888
Guido van Rossumefec1152000-03-28 02:01:15 +0000889#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000890
Guido van Rossum9e896b32000-04-05 20:11:21 +0000891/* --- Decimal Encoder ---------------------------------------------------- */
892
893/* Takes a Unicode string holding a decimal value and writes it into
894 an output buffer using standard ASCII digit codes.
895
896 The output buffer has to provide at least length+1 bytes of storage
897 area. The output string is 0-terminated.
898
899 The encoder converts whitespace to ' ', decimal characters to their
900 corresponding ASCII digit and all other Latin-1 characters except
901 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
902 are treated as errors. This includes embedded NULL bytes.
903
904 Error handling is defined by the errors argument:
905
906 NULL or "strict": raise a ValueError
907 "ignore": ignore the wrong characters (these are not copied to the
908 output buffer)
909 "replace": replaces illegal characters with '?'
910
911 Returns 0 on success, -1 on failure.
912
913*/
914
Mark Hammond91a681d2002-08-12 07:21:58 +0000915PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Guido van Rossum9e896b32000-04-05 20:11:21 +0000916 Py_UNICODE *s, /* Unicode buffer */
917 int length, /* Number of Py_UNICODE chars to encode */
918 char *output, /* Output buffer; must have size >= length+1 */
919 const char *errors /* error handling */
920 );
921
Guido van Rossumd8225182000-03-10 22:33:05 +0000922/* --- Methods & Slots ----------------------------------------------------
923
924 These are capable of handling Unicode objects and strings on input
925 (we refer to them as strings in the descriptions) and return
926 Unicode objects or integers as appropriate. */
927
928/* Concatenate left and right, giving a new Unicode string. */
929
Mark Hammond91a681d2002-08-12 07:21:58 +0000930PyAPI_FUNC(PyObject*) PyUnicode_Concat(
Guido van Rossumd8225182000-03-10 22:33:05 +0000931 PyObject *left, /* Left string */
932 PyObject *right /* Right string */
933 );
934
935/* Split the string s, giving a list of Unicode strings.
936
937 If sep is NULL, splitting will be done at all whitespace
938 substrings. Otherwise, splits occur at the given separator.
939
940 At most maxsplit splits will be done. If maxsplit is negative, no
941 limit is set.
942 Separators are not included in the resulting list.
943
944*/
945
Mark Hammond91a681d2002-08-12 07:21:58 +0000946PyAPI_FUNC(PyObject*) PyUnicode_Split(
Guido van Rossumd8225182000-03-10 22:33:05 +0000947 PyObject *s, /* String to split */
948 PyObject *sep, /* String separator */
949 int maxsplit /* Maxsplit count */
950 );
951
952/* Split a string at line breaks, giving a list of Unicode strings.
953
954 CRLF is considered to be one line break. Line breaks are not
955 included in the resulting list unless keepends is true. */
956
Mark Hammond91a681d2002-08-12 07:21:58 +0000957PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
Guido van Rossumd8225182000-03-10 22:33:05 +0000958 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000959 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000960 );
961
962/* Translate the string str by applying the character mapping table to
963 it and return the resulting Unicode object.
964
965 The mapping table must map Unicode ordinal integers to Unicode
966 ordinal integers or None (causing deletion of the character).
967
968 Mapping tables may be dictionaries or sequences. Unmapped character
969 ordinals (ones which cause a LookupError) are left untouched and
970 are copied as-is.
971
972*/
973
/* NOTE(review): which situations the errors argument applies to is not
   described here -- confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000974PyAPI_FUNC(PyObject *) PyUnicode_Translate(
Guido van Rossumd8225182000-03-10 22:33:05 +0000975 PyObject *str, /* String */
976 PyObject *table, /* Translate table */
977 const char *errors /* error handling */
978 );
979
980/* Join the strings in the sequence seq using the given separator and
981 return the resulting Unicode string. */
982
Mark Hammond91a681d2002-08-12 07:21:58 +0000983PyAPI_FUNC(PyObject*) PyUnicode_Join(
Guido van Rossumd8225182000-03-10 22:33:05 +0000984 PyObject *separator, /* Separator string */
985 PyObject *seq /* Sequence object */
986 );
987
988/* Return 1 if substr matches str[start:end] at the given tail end, 0
989 otherwise. */
990
/* direction selects the tail end: -1 tests substr as a prefix of
   str[start:end], +1 as a suffix.
   NOTE(review): the return value on error is not documented here --
   confirm against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +0000991PyAPI_FUNC(int) PyUnicode_Tailmatch(
Guido van Rossumd8225182000-03-10 22:33:05 +0000992 PyObject *str, /* String */
993 PyObject *substr, /* Prefix or Suffix string */
994 int start, /* Start index */
995 int end, /* Stop index */
996 int direction /* Tail end: -1 prefix, +1 suffix */
997 );
998
999/* Return the first position of substr in str[start:end] using the
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00001000 given search direction or -1 if not found. -2 is returned in case
1001 an error occurred and an exception is set. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001002
/* Callers must distinguish -1 (not found) from -2 (error).
   NOTE(review): presumably the returned position is an index into str
   rather than an offset into the slice -- confirm against the
   implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001003PyAPI_FUNC(int) PyUnicode_Find(
Guido van Rossumd8225182000-03-10 22:33:05 +00001004 PyObject *str, /* String */
1005 PyObject *substr, /* Substring to find */
1006 int start, /* Start index */
1007 int end, /* Stop index */
1008 int direction /* Find direction: +1 forward, -1 backward */
1009 );
1010
Barry Warsaw51ac5802000-03-20 16:36:48 +00001011/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +00001012
/* NOTE(review): the return value on error is not documented here;
   presumably a negative value, as for PyUnicode_Find -- confirm against
   the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001013PyAPI_FUNC(int) PyUnicode_Count(
Guido van Rossumd8225182000-03-10 22:33:05 +00001014 PyObject *str, /* String */
1015 PyObject *substr, /* Substring to count */
1016 int start, /* Start index */
1017 int end /* Stop index */
1018 );
1019
Barry Warsaw51ac5802000-03-20 16:36:48 +00001020/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +00001021 and return the resulting Unicode object. */
1022
/* A maxcount of -1 means that all occurrences are replaced. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001023PyAPI_FUNC(PyObject *) PyUnicode_Replace(
Guido van Rossumd8225182000-03-10 22:33:05 +00001024 PyObject *str, /* String */
1025 PyObject *substr, /* Substring to find */
1026 PyObject *replstr, /* Substring to replace */
1027 int maxcount /* Max. number of replacements to apply;
1028 -1 = all */
1029 );
1030
1031/* Compare two strings and return -1, 0, 1 for less than, equal,
1032 greater than resp. */
1033
/* NOTE(review): -1 is a regular result here, so how an error is
   signalled cannot be told from this header -- confirm against the
   implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001034PyAPI_FUNC(int) PyUnicode_Compare(
Guido van Rossumd8225182000-03-10 22:33:05 +00001035 PyObject *left, /* Left string */
1036 PyObject *right /* Right string */
1037 );
1038
Thomas Wouters7e474022000-07-16 12:04:32 +00001039/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +00001040 the resulting Unicode string. */
1041
Mark Hammond91a681d2002-08-12 07:21:58 +00001042PyAPI_FUNC(PyObject *) PyUnicode_Format(
Guido van Rossumd8225182000-03-10 22:33:05 +00001043 PyObject *format, /* Format string */
1044 PyObject *args /* Argument tuple or dictionary */
1045 );
1046
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001047/* Checks whether element is contained in container and returns 1/0
1048 accordingly.
1049
1050 element has to coerce to a one-element Unicode string. -1 is
1051 returned in case of an error. */
1052
Mark Hammond91a681d2002-08-12 07:21:58 +00001053PyAPI_FUNC(int) PyUnicode_Contains(
Guido van Rossumd0d366b2000-03-13 23:22:24 +00001054 PyObject *container, /* Container string */
1055 PyObject *element /* Element string */
1056 );
1057
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001058/* Externally visible for str.strip(unicode) */
/* NOTE(review): the parameters are not documented in this header.
   Presumably striptype selects left-, right- or both-sided stripping
   and sepobj holds the characters to strip from self -- confirm
   against the implementation. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001059PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00001060 PyUnicodeObject *self,
1061 int striptype,
1062 PyObject *sepobj
1063 );
1064
Guido van Rossumd8225182000-03-10 22:33:05 +00001065/* === Characters Type APIs =============================================== */
1066
1067/* These should not be used directly. Use the Py_UNICODE_IS* and
1068 Py_UNICODE_TO* macros instead.
1069
1070 These APIs are implemented in Objects/unicodectype.c.
1071
1072*/
1073
/* Character type helpers. Each takes a single Py_UNICODE character:

   _PyUnicode_Is* (Lowercase, Uppercase, Titlecase, Whitespace,
   Linebreak, DecimalDigit, Digit, Numeric, Alpha) return int,
   _PyUnicode_ToLowercase/ToUppercase/ToTitlecase return Py_UNICODE,
   _PyUnicode_ToDecimalDigit/ToDigit return int and
   _PyUnicode_ToNumeric returns double.

   NOTE(review): presumably the Is* functions return a true/false flag;
   what the ToDecimalDigit/ToDigit/ToNumeric functions return for a
   character that has no such value is not documented here -- confirm
   in Objects/unicodectype.c. */
Mark Hammond91a681d2002-08-12 07:21:58 +00001074PyAPI_FUNC(int) _PyUnicode_IsLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001075 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001076 );
1077
Mark Hammond91a681d2002-08-12 07:21:58 +00001078PyAPI_FUNC(int) _PyUnicode_IsUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001079 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001080 );
1081
Mark Hammond91a681d2002-08-12 07:21:58 +00001082PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001083 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001084 );
1085
Mark Hammond91a681d2002-08-12 07:21:58 +00001086PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001087 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001088 );
1089
Mark Hammond91a681d2002-08-12 07:21:58 +00001090PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001091 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001092 );
1093
Mark Hammond91a681d2002-08-12 07:21:58 +00001094PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001095 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001096 );
1097
Mark Hammond91a681d2002-08-12 07:21:58 +00001098PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001099 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001100 );
1101
Mark Hammond91a681d2002-08-12 07:21:58 +00001102PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001103 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001104 );
1105
Mark Hammond91a681d2002-08-12 07:21:58 +00001106PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001107 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001108 );
1109
Mark Hammond91a681d2002-08-12 07:21:58 +00001110PyAPI_FUNC(int) _PyUnicode_ToDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001111 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001112 );
1113
Mark Hammond91a681d2002-08-12 07:21:58 +00001114PyAPI_FUNC(double) _PyUnicode_ToNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001115 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001116 );
1117
Mark Hammond91a681d2002-08-12 07:21:58 +00001118PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001119 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001120 );
1121
Mark Hammond91a681d2002-08-12 07:21:58 +00001122PyAPI_FUNC(int) _PyUnicode_IsDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001123 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001124 );
1125
Mark Hammond91a681d2002-08-12 07:21:58 +00001126PyAPI_FUNC(int) _PyUnicode_IsNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001127 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +00001128 );
1129
Mark Hammond91a681d2002-08-12 07:21:58 +00001130PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Fredrik Lundh72b06852001-06-27 22:08:26 +00001131 Py_UNICODE ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +00001132 );
1133
Guido van Rossumd8225182000-03-10 22:33:05 +00001134#ifdef __cplusplus
1135}
1136#endif
Martin v. Löwis339d0f72001-08-17 18:39:25 +00001137#endif /* Py_USING_UNICODE */
Guido van Rossumd8225182000-03-10 22:33:05 +00001138#endif /* !Py_UNICODEOBJECT_H */