blob: 871fbed1d80cf6b77bafc1374e9944f6025e0498 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
Fredrik Lundh9b14ab32001-06-26 22:59:49 +000061/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
62 properly set, but the default rules below don't set it. I'll
63 sort this out some other day -- fredrik@pythonware.com */
64
65#ifndef Py_UNICODE_SIZE
66#error Must define Py_UNICODE_SIZE
67#endif
68
Fredrik Lundh8f455852001-06-27 18:59:43 +000069/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
70 strings are stored as UCS-2 (with limited support for UTF-16) */
71
72#if Py_UNICODE_SIZE >= 4
73#define Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000074#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000075
Guido van Rossumd8225182000-03-10 22:33:05 +000076/* Set these flags if the platform has "wchar.h", "wctype.h" and the
77 wchar_t type is a 16-bit unsigned type */
78/* #define HAVE_WCHAR_H */
79/* #define HAVE_USABLE_WCHAR_T */
80
81/* Defaults for various platforms */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000082#ifndef PY_UNICODE_TYPE
Guido van Rossumd8225182000-03-10 22:33:05 +000083
Fredrik Lundh1294ad02001-06-26 17:17:07 +000084/* Windows has a usable wchar_t type (unless we're using UCS-4) */
Fredrik Lundh8f455852001-06-27 18:59:43 +000085# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
Guido van Rossumd8225182000-03-10 22:33:05 +000086# define HAVE_USABLE_WCHAR_T
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000087# define PY_UNICODE_TYPE wchar_t
88# endif
89
Fredrik Lundh8f455852001-06-27 18:59:43 +000090# if defined(Py_UNICODE_WIDE)
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000091# define PY_UNICODE_TYPE Py_UCS4
Guido van Rossumd8225182000-03-10 22:33:05 +000092# endif
93
94#endif
95
96/* If the compiler provides a wchar_t type we try to support it
97 through the interface functions PyUnicode_FromWideChar() and
98 PyUnicode_AsWideChar(). */
99
100#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000101# ifndef HAVE_WCHAR_H
102# define HAVE_WCHAR_H
103# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000104#endif
105
106#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
108# ifdef _HAVE_BSDI
109# include <time.h>
110# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000111# include "wchar.h"
112#endif
113
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000114/*
115 * Use this typedef when you need to represent a UTF-16 surrogate pair
116 * as a single unsigned integer.
117 */
118#if SIZEOF_INT >= 4
119typedef unsigned int Py_UCS4;
120#elif SIZEOF_LONG >= 4
121typedef unsigned long Py_UCS4;
Guido van Rossumd8225182000-03-10 22:33:05 +0000122#endif
123
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000124#if SIZEOF_SHORT == 2
125typedef unsigned short Py_UCS2;
126#else
127#error Cannot find a two-byte type
128#endif
Marc-André Lemburg43279102000-07-07 09:01:41 +0000129
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000130typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000131
Guido van Rossumd8225182000-03-10 22:33:05 +0000132/* --- Internal Unicode Operations ---------------------------------------- */
133
134/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000135 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
136 configure Python using --with-ctype-functions. This reduces the
137 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000138
139#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
140
141#include "wctype.h"
142
143#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
144
145#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
146#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
147#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
148#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
149
150#define Py_UNICODE_TOLOWER(ch) towlower(ch)
151#define Py_UNICODE_TOUPPER(ch) towupper(ch)
152#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
153
154#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
155#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
156#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
157
158#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
159#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
160#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
161
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000162#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
163
Guido van Rossumd8225182000-03-10 22:33:05 +0000164#else
165
166#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
167
168#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
169#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
170#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
171#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
172
173#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
174#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
175#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
176
177#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
178#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
179#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
180
181#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
182#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
183#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
184
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000185#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000186
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000187#endif
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000188
189#define Py_UNICODE_ISALNUM(ch) \
190 (Py_UNICODE_ISALPHA(ch) || \
191 Py_UNICODE_ISDECIMAL(ch) || \
192 Py_UNICODE_ISDIGIT(ch) || \
193 Py_UNICODE_ISNUMERIC(ch))
194
Guido van Rossumd8225182000-03-10 22:33:05 +0000195#define Py_UNICODE_COPY(target, source, length)\
196 (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
197
/* Fill the first `length` elements of the buffer `target` with `value`.

   Expands to a single statement (do { ... } while (0)), so it can be
   used as the unbraced body of an if/else.

   The loop counter carries a macro-private name so that it cannot
   capture a caller's variable: with a counter simply called `i`,
   Py_UNICODE_FILL(buf, ch, i) would compare the counter against itself
   and fill nothing.

   Note: `target`, `value` and `length` are re-evaluated as the loop
   runs, so the argument expressions must be free of side effects. */
#define Py_UNICODE_FILL(target, value, length) do\
    {int py_unicode_fill_i_;\
     for (py_unicode_fill_i_ = 0; py_unicode_fill_i_ < (length);\
          py_unicode_fill_i_++)\
         (target)[py_unicode_fill_i_] = (value);}\
    while (0)
201
202#define Py_UNICODE_MATCH(string, offset, substring)\
Marc-André Lemburg2f4d0e92000-06-18 22:22:27 +0000203 ((*((string)->str + (offset)) == *((substring)->str)) &&\
204 !memcmp((string)->str + (offset), (substring)->str,\
Guido van Rossumd8225182000-03-10 22:33:05 +0000205 (substring)->length*sizeof(Py_UNICODE)))
206
Barry Warsaw51ac5802000-03-20 16:36:48 +0000207#ifdef __cplusplus
208extern "C" {
209#endif
210
Guido van Rossumd8225182000-03-10 22:33:05 +0000211/* --- Unicode Type ------------------------------------------------------- */
212
213typedef struct {
214 PyObject_HEAD
215 int length; /* Length of raw Unicode data in buffer */
216 Py_UNICODE *str; /* Raw Unicode buffer */
217 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000218 PyObject *defenc; /* (Default) Encoded version as Python
219 string, or NULL; this is used for
220 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000221} PyUnicodeObject;
222
223extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
224
225#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
226
227/* Fast access macros */
228#define PyUnicode_GET_SIZE(op) \
229 (((PyUnicodeObject *)(op))->length)
230#define PyUnicode_GET_DATA_SIZE(op) \
231 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
232#define PyUnicode_AS_UNICODE(op) \
233 (((PyUnicodeObject *)(op))->str)
234#define PyUnicode_AS_DATA(op) \
235 ((const char *)((PyUnicodeObject *)(op))->str)
236
237/* --- Constants ---------------------------------------------------------- */
238
239/* This Unicode character will be used as replacement character during
240 decoding if the errors argument is set to "replace". Note: the
241 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
242 Unicode 3.0. */
243
244#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
245
246/* === Public API ========================================================= */
247
248/* --- Plain Py_UNICODE --------------------------------------------------- */
249
250/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000251 size.
252
253 u may be NULL which causes the contents to be undefined. It is the
254 user's responsibility to fill in the needed data afterwards. Note
255 that modifying the Unicode object contents after construction is
256 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000257
258 The buffer is copied into the new object. */
259
260extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
261 const Py_UNICODE *u, /* Unicode buffer */
262 int size /* size of buffer */
263 );
264
265/* Return a read-only pointer to the Unicode object's internal
266 Py_UNICODE buffer. */
267
268extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
269 PyObject *unicode /* Unicode object */
270 );
271
272/* Get the length of the Unicode object. */
273
274extern DL_IMPORT(int) PyUnicode_GetSize(
275 PyObject *unicode /* Unicode object */
276 );
277
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278/* Get the maximum ordinal for a Unicode character. */
279extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
280
Guido van Rossum52c23592000-04-10 13:41:41 +0000281/* Resize an already allocated Unicode object to the new size length.
282
283 *unicode is modified to point to the new (resized) object and 0
284 returned on success.
285
286 This API may only be called by the function which also called the
287 Unicode constructor. The refcount on the object must be 1. Otherwise,
288 an error is returned.
289
290 Error handling is implemented as follows: an exception is set, -1
291 is returned and *unicode left untouched.
292
293*/
294
295extern DL_IMPORT(int) PyUnicode_Resize(
296 PyObject **unicode, /* Pointer to the Unicode object */
297 int length /* New length */
298 );
299
Guido van Rossumd8225182000-03-10 22:33:05 +0000300/* Coerce obj to a Unicode object and return a reference with
301 *incremented* refcount.
302
303 Coercion is done in the following way:
304
305 1. Unicode objects are passed back as-is with incremented
306 refcount.
307
308 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000309 under the assumptions that they contain data using the current
310 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000311
312 3. All other objects raise an exception.
313
314 The API returns NULL in case of an error. The caller is responsible
315 for decref'ing the returned objects.
316
317*/
318
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000319extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
320 register PyObject *obj, /* Object */
321 const char *encoding, /* encoding */
322 const char *errors /* error handling */
323 );
324
325/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
326 which results in using the default encoding as basis for
327 decoding the object.
328
329 Coerces obj to a Unicode object and returns a reference with
330 *incremented* refcount.
331
332 The API returns NULL in case of an error. The caller is responsible
333 for decref'ing the returned objects.
334
335*/
336
Guido van Rossumd8225182000-03-10 22:33:05 +0000337extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
338 register PyObject *obj /* Object */
339 );
340
341/* --- wchar_t support for platforms which support it --------------------- */
342
343#ifdef HAVE_WCHAR_H
344
345/* Create a Unicode Object from the wchar_t buffer w of the given
346 size.
347
348 The buffer is copied into the new object. */
349
350extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
351 register const wchar_t *w, /* wchar_t buffer */
352 int size /* size of buffer */
353 );
354
355/* Copies the Unicode Object contents into the wchar_t buffer w. At
356 most size wchar_t characters are copied.
357
358 Returns the number of wchar_t characters copied or -1 in case of an
359 error. */
360
361extern DL_IMPORT(int) PyUnicode_AsWideChar(
362 PyUnicodeObject *unicode, /* Unicode object */
363 register wchar_t *w, /* wchar_t buffer */
364 int size /* size of buffer */
365 );
366
367#endif
368
369/* === Builtin Codecs =====================================================
370
371 Many of these APIs take two arguments encoding and errors. These
372 parameters encoding and errors have the same semantics as the ones
373 of the builtin unicode() API.
374
Fred Drakecb093fe2000-05-09 19:51:53 +0000375 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000376
377 Error handling is set by errors which may also be set to NULL
378 meaning to use the default handling defined for the codec. Default
379 error handling for all builtin codecs is "strict" (ValueErrors are
380 raised).
381
382 The codecs all use a similar interface. Only deviations from the
383 generic ones are documented.
384
385*/
386
Fred Drakecb093fe2000-05-09 19:51:53 +0000387/* --- Manage the default encoding ---------------------------------------- */
388
389/* Returns the currently active default encoding.
390
391 The default encoding is currently implemented as run-time settable
392 process global. This may change in future versions of the
393 interpreter to become a parameter which is managed on a per-thread
394 basis.
395
396 */
397
Thomas Wouters5f375912000-07-22 23:30:03 +0000398extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000399
400/* Sets the currently active default encoding.
401
402 Returns 0 on success, -1 in case of an error.
403
404 */
405
406extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
407 const char *encoding /* Encoding name in standard form */
408 );
409
Guido van Rossumd8225182000-03-10 22:33:05 +0000410/* --- Generic Codecs ----------------------------------------------------- */
411
412/* Create a Unicode object by decoding the encoded string s of the
413 given size. */
414
415extern DL_IMPORT(PyObject*) PyUnicode_Decode(
416 const char *s, /* encoded string */
417 int size, /* size of buffer */
418 const char *encoding, /* encoding */
419 const char *errors /* error handling */
420 );
421
422/* Encodes a Py_UNICODE buffer of the given size and returns a
423 Python string object. */
424
425extern DL_IMPORT(PyObject*) PyUnicode_Encode(
426 const Py_UNICODE *s, /* Unicode char buffer */
427 int size, /* number of Py_UNICODE chars to encode */
428 const char *encoding, /* encoding */
429 const char *errors /* error handling */
430 );
431
432/* Encodes a Unicode object and returns the result as Python string
433 object. */
434
435extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
436 PyObject *unicode, /* Unicode object */
437 const char *encoding, /* encoding */
438 const char *errors /* error handling */
439 );
440
441/* --- UTF-8 Codecs ------------------------------------------------------- */
442
443extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
444 const char *string, /* UTF-8 encoded string */
445 int length, /* size of string */
446 const char *errors /* error handling */
447 );
448
449extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
450 PyObject *unicode /* Unicode object */
451 );
452
453extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
454 const Py_UNICODE *data, /* Unicode char buffer */
455 int length, /* number of Py_UNICODE chars to encode */
456 const char *errors /* error handling */
457 );
458
459/* --- UTF-16 Codecs ------------------------------------------------------ */
460
Guido van Rossum9e896b32000-04-05 20:11:21 +0000461/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000462 the corresponding Unicode object.
463
464 errors (if non-NULL) defines the error handling. It defaults
465 to "strict".
466
467 If byteorder is non-NULL, the decoder starts decoding using the
468 given byte order:
469
470 *byteorder == -1: little endian
471 *byteorder == 0: native order
472 *byteorder == 1: big endian
473
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000474 In native mode, the first two bytes of the stream are checked for a
475 BOM mark. If found, the BOM mark is analysed, the byte order
476 adjusted and the BOM skipped. In the other modes, no BOM mark
477 interpretation is done. After completion, *byteorder is set to the
478 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000479
480 If byteorder is NULL, the codec starts in native order mode.
481
482*/
483
484extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
485 const char *string, /* UTF-16 encoded string */
486 int length, /* size of string */
487 const char *errors, /* error handling */
488 int *byteorder /* pointer to byteorder to use
489 0=native;-1=LE,1=BE; updated on
490 exit */
491 );
492
493/* Returns a Python string using the UTF-16 encoding in native byte
494 order. The string always starts with a BOM mark. */
495
496extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
497 PyObject *unicode /* Unicode object */
498 );
499
500/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000501 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000502
503 If byteorder is not 0, output is written according to the following
504 byte order:
505
506 byteorder == -1: little endian
507 byteorder == 0: native byte order (writes a BOM mark)
508 byteorder == 1: big endian
509
510 If byteorder is 0, the output string will always start with the
511 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
512 prepended.
513
514 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
515 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000516 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000517
518*/
519
520extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
521 const Py_UNICODE *data, /* Unicode char buffer */
522 int length, /* number of Py_UNICODE chars to encode */
523 const char *errors, /* error handling */
524 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
525 );
526
527/* --- Unicode-Escape Codecs ---------------------------------------------- */
528
529extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
530 const char *string, /* Unicode-Escape encoded string */
531 int length, /* size of string */
532 const char *errors /* error handling */
533 );
534
535extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
536 PyObject *unicode /* Unicode object */
537 );
538
539extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
540 const Py_UNICODE *data, /* Unicode char buffer */
541 int length /* Number of Py_UNICODE chars to encode */
542 );
543
544/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
545
546extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
547 const char *string, /* Raw-Unicode-Escape encoded string */
548 int length, /* size of string */
549 const char *errors /* error handling */
550 );
551
552extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
553 PyObject *unicode /* Unicode object */
554 );
555
556extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
557 const Py_UNICODE *data, /* Unicode char buffer */
558 int length /* Number of Py_UNICODE chars to encode */
559 );
560
561/* --- Latin-1 Codecs -----------------------------------------------------
562
563 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
564
565*/
566
567extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
568 const char *string, /* Latin-1 encoded string */
569 int length, /* size of string */
570 const char *errors /* error handling */
571 );
572
573extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
574 PyObject *unicode /* Unicode object */
575 );
576
577extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
578 const Py_UNICODE *data, /* Unicode char buffer */
579 int length, /* Number of Py_UNICODE chars to encode */
580 const char *errors /* error handling */
581 );
582
583/* --- ASCII Codecs -------------------------------------------------------
584
585 Only 7-bit ASCII data is accepted. All other codes generate errors.
586
587*/
588
589extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
590 const char *string, /* ASCII encoded string */
591 int length, /* size of string */
592 const char *errors /* error handling */
593 );
594
595extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
596 PyObject *unicode /* Unicode object */
597 );
598
599extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
600 const Py_UNICODE *data, /* Unicode char buffer */
601 int length, /* Number of Py_UNICODE chars to encode */
602 const char *errors /* error handling */
603 );
604
605/* --- Character Map Codecs -----------------------------------------------
606
607 This codec uses mappings to encode and decode characters.
608
609 Decoding mappings must map single string characters to single
610 Unicode characters, integers (which are then interpreted as Unicode
611 ordinals) or None (meaning "undefined mapping" and causing an
612 error).
613
614 Encoding mappings must map single Unicode characters to single
615 string characters, integers (which are then interpreted as Latin-1
616 ordinals) or None (meaning "undefined mapping" and causing an
617 error).
618
619 If a character lookup fails with a LookupError, the character is
620 copied as-is meaning that its ordinal value will be interpreted as
621 Unicode or Latin-1 ordinal resp. Because of this mappings only need
622 to contain those mappings which map characters to different code
623 points.
624
625*/
626
627extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
628 const char *string, /* Encoded string */
629 int length, /* size of string */
630 PyObject *mapping, /* character mapping
631 (char ordinal -> unicode ordinal) */
632 const char *errors /* error handling */
633 );
634
635extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
636 PyObject *unicode, /* Unicode object */
637 PyObject *mapping /* character mapping
638 (unicode ordinal -> char ordinal) */
639 );
640
641extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
642 const Py_UNICODE *data, /* Unicode char buffer */
643 int length, /* Number of Py_UNICODE chars to encode */
644 PyObject *mapping, /* character mapping
645 (unicode ordinal -> char ordinal) */
646 const char *errors /* error handling */
647 );
648
649/* Translate a Py_UNICODE buffer of the given length by applying a
650 character mapping table to it and return the resulting Unicode
651 object.
652
653 The mapping table must map Unicode ordinal integers to Unicode
654 ordinal integers or None (causing deletion of the character).
655
656 Mapping tables may be dictionaries or sequences. Unmapped character
657 ordinals (ones which cause a LookupError) are left untouched and
658 are copied as-is.
659
660*/
661
662extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
663 const Py_UNICODE *data, /* Unicode char buffer */
664 int length, /* Number of Py_UNICODE chars to encode */
665 PyObject *table, /* Translate table */
666 const char *errors /* error handling */
667 );
668
Guido van Rossumefec1152000-03-28 02:01:15 +0000669#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000670
Guido van Rossumefec1152000-03-28 02:01:15 +0000671/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000672
Guido van Rossumefec1152000-03-28 02:01:15 +0000673extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
674 const char *string, /* MBCS encoded string */
675 int length, /* size of string */
676 const char *errors /* error handling */
677 );
678
679extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
680 PyObject *unicode /* Unicode object */
681 );
682
683extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
684 const Py_UNICODE *data, /* Unicode char buffer */
685 int length, /* Number of Py_UNICODE chars to encode */
686 const char *errors /* error handling */
687 );
688
Guido van Rossumefec1152000-03-28 02:01:15 +0000689#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000690
Guido van Rossum9e896b32000-04-05 20:11:21 +0000691/* --- Decimal Encoder ---------------------------------------------------- */
692
693/* Takes a Unicode string holding a decimal value and writes it into
694 an output buffer using standard ASCII digit codes.
695
696 The output buffer has to provide at least length+1 bytes of storage
697 area. The output string is 0-terminated.
698
699 The encoder converts whitespace to ' ', decimal characters to their
700 corresponding ASCII digit and all other Latin-1 characters except
701 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
702 are treated as errors. This includes embedded NULL bytes.
703
704 Error handling is defined by the errors argument:
705
706 NULL or "strict": raise a ValueError
707 "ignore": ignore the wrong characters (these are not copied to the
708 output buffer)
709 "replace": replaces illegal characters with '?'
710
711 Returns 0 on success, -1 on failure.
712
713*/
714
715extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
716 Py_UNICODE *s, /* Unicode buffer */
717 int length, /* Number of Py_UNICODE chars to encode */
718 char *output, /* Output buffer; must have size >= length+1
                    (room for the terminating 0 byte) */
719 const char *errors /* error handling */
720 );
721
721
Guido van Rossumd8225182000-03-10 22:33:05 +0000722/* --- Methods & Slots ----------------------------------------------------
723
724 These are capable of handling Unicode objects and strings on input
725 (we refer to them as strings in the descriptions) and return
726 Unicode objects or integers as appropriate. */
727
728/* Concat two strings giving a new Unicode string. */
729
730extern DL_IMPORT(PyObject*) PyUnicode_Concat(
731 PyObject *left, /* Left string */
732 PyObject *right /* Right string */
733 );
734
735/* Split a string giving a list of Unicode strings.
736
737 If sep is NULL, splitting will be done at all whitespace
738 substrings. Otherwise, splits occur at the given separator.
739
740 At most maxsplit splits will be done. If negative, no limit is set.
741
742 Separators are not included in the resulting list.
743
744*/
745
746extern DL_IMPORT(PyObject*) PyUnicode_Split(
747 PyObject *s, /* String to split */
748 PyObject *sep, /* String separator */
749 int maxsplit /* Maxsplit count */
750 );
751
752/* Ditto, but split at line breaks.
753
754 CRLF is considered to be one line break. Line breaks are not
755 included in the resulting list. */
756
757extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
758 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000759 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000760 );
761
762/* Translate a string by applying a character mapping table to it and
763 return the resulting Unicode object.
764
765 The mapping table must map Unicode ordinal integers to Unicode
766 ordinal integers or None (causing deletion of the character).
767
768 Mapping tables may be dictionaries or sequences. Unmapped character
769 ordinals (ones which cause a LookupError) are left untouched and
770 are copied as-is.
771
772*/
773
774extern DL_IMPORT(PyObject *) PyUnicode_Translate(
775 PyObject *str, /* String */
776 PyObject *table, /* Translate table */
777 const char *errors /* error handling */
778 );
779
780/* Join a sequence of strings using the given separator and return
781 the resulting Unicode string. */
782
783extern DL_IMPORT(PyObject*) PyUnicode_Join(
784 PyObject *separator, /* Separator string */
785 PyObject *seq /* Sequence object */
786 );
787
788/* Return 1 if substr matches str[start:end] at the given tail end, 0
789 otherwise. */
790
791extern DL_IMPORT(int) PyUnicode_Tailmatch(
792 PyObject *str, /* String */
793 PyObject *substr, /* Prefix or Suffix string */
794 int start, /* Start index */
795 int end, /* Stop index */
796 int direction /* Tail end: -1 prefix, +1 suffix */
797 );
798
799/* Return the first position of substr in str[start:end] using the
800 given search direction or -1 if not found. */
801
802extern DL_IMPORT(int) PyUnicode_Find(
803 PyObject *str, /* String */
804 PyObject *substr, /* Substring to find */
805 int start, /* Start index */
806 int end, /* Stop index */
807 int direction /* Find direction: +1 forward, -1 backward */
808 );
809
Barry Warsaw51ac5802000-03-20 16:36:48 +0000810/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000811
812extern DL_IMPORT(int) PyUnicode_Count(
813 PyObject *str, /* String */
814 PyObject *substr, /* Substring to count */
815 int start, /* Start index */
816 int end /* Stop index */
817 );
818
Barry Warsaw51ac5802000-03-20 16:36:48 +0000819/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000820 and return the resulting Unicode object. A maxcount of -1 replaces
   all occurrences. */
821
822extern DL_IMPORT(PyObject *) PyUnicode_Replace(
823 PyObject *str, /* String */
824 PyObject *substr, /* Substring to find */
825 PyObject *replstr, /* Replacement string substituted for substr */
826 int maxcount /* Max. number of replacements to apply;
827 -1 = all */
828 );
829
830/* Compare two strings and return -1, 0 or 1 if left is less than,
831 equal to, or greater than right, respectively.
   NOTE(review): how an error is reported is not documented here --
   confirm against the implementation. */
832
833extern DL_IMPORT(int) PyUnicode_Compare(
834 PyObject *left, /* Left string */
835 PyObject *right /* Right string */
836 );
837
Thomas Wouters7e474022000-07-16 12:04:32 +0000838/* Apply an argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +0000839 the resulting Unicode string.
   NOTE(review): the failure return value is not documented here --
   presumably NULL; confirm against the implementation. */
840
841extern DL_IMPORT(PyObject *) PyUnicode_Format(
842 PyObject *format, /* Format string */
843 PyObject *args /* Argument tuple or dictionary */
844 );
845
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000846/* Check whether element is contained in container and return 1 if it
847 is, 0 if it is not.
848
849 element has to coerce to a one-element Unicode string. -1 is
850 returned in case of an error. */
851
852extern DL_IMPORT(int) PyUnicode_Contains(
853 PyObject *container, /* Container string */
854 PyObject *element /* Element string */
855 );
856
Guido van Rossumd8225182000-03-10 22:33:05 +0000857/* === Characters Type APIs =============================================== */
858
859/* These should not be used directly. Use the Py_UNICODE_IS* and
860 Py_UNICODE_TO* macros instead.
861
862 These APIs are implemented in Objects/unicodectype.c.
863
864*/
865
/* Lowercase test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is a lowercase character --
   confirm in Objects/unicodectype.c. */
866extern DL_IMPORT(int) _PyUnicode_IsLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000867 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000868 );
869
/* Uppercase test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is an uppercase character --
   confirm in Objects/unicodectype.c. */
870extern DL_IMPORT(int) _PyUnicode_IsUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000871 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000872 );
873
/* Titlecase test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is a titlecase character --
   confirm in Objects/unicodectype.c. */
874extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000875 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000876 );
877
/* Whitespace test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is a whitespace character --
   confirm in Objects/unicodectype.c. */
878extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000879 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000880 );
881
/* Line-break test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is a line-break character --
   confirm in Objects/unicodectype.c. */
882extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000883 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000884 );
885
/* Lowercase conversion of a single character; the result is a
   Py_UNICODE.
   NOTE(review): presumably returns the lowercase equivalent of ch --
   confirm in Objects/unicodectype.c. */
886extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000887 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000888 );
889
/* Uppercase conversion of a single character; the result is a
   Py_UNICODE.
   NOTE(review): presumably returns the uppercase equivalent of ch --
   confirm in Objects/unicodectype.c. */
890extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000891 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000892 );
893
/* Titlecase conversion of a single character; the result is a
   Py_UNICODE.
   NOTE(review): presumably returns the titlecase equivalent of ch --
   confirm in Objects/unicodectype.c. */
894extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000895 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000896 );
897
/* Decimal-digit conversion of a single character; the result is an int.
   NOTE(review): presumably the decimal digit value of ch; the value
   returned for non-decimal characters is not documented here --
   confirm in Objects/unicodectype.c. */
898extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000899 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000900 );
901
/* Digit conversion of a single character; the result is an int.
   NOTE(review): presumably the digit value of ch; the value returned
   for non-digit characters is not documented here -- confirm in
   Objects/unicodectype.c. */
902extern DL_IMPORT(int) _PyUnicode_ToDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000903 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000904 );
905
/* Numeric conversion of a single character; the result is a double.
   NOTE(review): presumably the numeric value of ch; the value returned
   for non-numeric characters is not documented here -- confirm in
   Objects/unicodectype.c. */
906extern DL_IMPORT(double) _PyUnicode_ToNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000907 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000908 );
909
/* Decimal-digit test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is a decimal digit --
   confirm in Objects/unicodectype.c. */
910extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000911 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000912 );
913
/* Digit test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is a digit character --
   confirm in Objects/unicodectype.c. */
914extern DL_IMPORT(int) _PyUnicode_IsDigit(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000915 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000916 );
917
/* Numeric test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch has a numeric value --
   confirm in Objects/unicodectype.c. */
918extern DL_IMPORT(int) _PyUnicode_IsNumeric(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000919 Py_UNICODE ch /* Unicode character */
Guido van Rossumd8225182000-03-10 22:33:05 +0000920 );
921
/* Alphabetic test on a single character; the result is an int.
   NOTE(review): presumably nonzero iff ch is an alphabetic character --
   confirm in Objects/unicodectype.c. */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000922extern DL_IMPORT(int) _PyUnicode_IsAlpha(
Fredrik Lundh72b06852001-06-27 22:08:26 +0000923 Py_UNICODE ch /* Unicode character */
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000924 );
925
Guido van Rossumd8225182000-03-10 22:33:05 +0000926#ifdef __cplusplus
927}
928#endif
929#endif /* !Py_UNICODEOBJECT_H */