blob: e330fd1bbe5d0274178ef3d0c307159c129d66a4 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
Fredrik Lundh1294ad02001-06-26 17:17:07 +000061/* experimental UCS-4 support. enable at your own risk! */
62#undef USE_UCS4_STORAGE
63
/* Py_UCS4: an unsigned integer type used when a UTF-16 surrogate pair
   has to be represented as a single unsigned integer.

   The first of `unsigned int` / `unsigned long` whose configured size
   (SIZEOF_INT / SIZEOF_LONG) is at least 4 bytes is selected. If
   neither macro reports 4 or more, no typedef is emitted. */
#if (SIZEOF_INT >= 4)
typedef unsigned int Py_UCS4;
#else
# if (SIZEOF_LONG >= 4)
typedef unsigned long Py_UCS4;
# endif
#endif
73
Guido van Rossumd8225182000-03-10 22:33:05 +000074/* Set these flags if the platform has "wchar.h", "wctype.h" and the
75 wchar_t type is a 16-bit unsigned type */
76/* #define HAVE_WCHAR_H */
77/* #define HAVE_USABLE_WCHAR_T */
78
/* Platform defaults: Windows has a usable wchar_t type, unless UCS-4
   storage was requested. */
#if !defined(HAVE_USABLE_WCHAR_T) && \
    defined(MS_WIN32) && !defined(USE_UCS4_STORAGE)
# define HAVE_USABLE_WCHAR_T
#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). A usable wchar_t therefore implies that
   wchar.h gets included. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
# define HAVE_WCHAR_H
#endif

#if defined(HAVE_WCHAR_H)
/* Work around a cosmetic bug in BSDI 4.x wchar.h (thanks to Thomas
   Wouters): <time.h> has to be pulled in first there. */
# if defined(_HAVE_BSDI)
#  include <time.h>
# endif
# include "wchar.h"
#endif
106
/* Py_UNICODE: the storage type for a single Unicode code unit.

   - If the compiler defines wchar_t as a 16-bit unsigned type
     (HAVE_USABLE_WCHAR_T), the compiler type is used directly. This
     works fine with all modern Windows platforms.
   - Otherwise, with USE_UCS4_STORAGE defined, Py_UCS4 is used.
   - Otherwise `unsigned short` is used, which suits a standard ANSI
     compiler without wchar_t support. If a short is not 16 bits on
     your platform, you have to fix that typedef, or the module
     initialization code will complain. */
#if defined(HAVE_USABLE_WCHAR_T)
typedef wchar_t Py_UNICODE;
#elif defined(USE_UCS4_STORAGE)
typedef Py_UCS4 Py_UNICODE;
#else
typedef unsigned short Py_UNICODE;
#endif
Marc-André Lemburg43279102000-07-07 09:01:41 +0000128
129
Guido van Rossumd8225182000-03-10 22:33:05 +0000130/* --- Internal Unicode Operations ---------------------------------------- */
131
/* If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-wctype-functions. This reduces the
   interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000136
/* Character classification / conversion macros.

   Only the macros whose implementation depends on the configuration
   live inside the conditional: with a usable wchar_t and
   WANT_WCTYPE_FUNCTIONS they map onto the compiler's wctype.h
   functions, otherwise onto the _PyUnicode_* functions. */
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* These macros expand to the _PyUnicode_* functions in either
   configuration. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000186
/* True if ch is alphabetic, a decimal digit, a digit or numeric; the
   four checks are tried in that order and evaluation stops at the
   first one that succeeds. Note that ch may be evaluated up to four
   times. */
#define Py_UNICODE_ISALNUM(ch)                                  \
    (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) ||      \
     Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch))
192
/* Copy length Py_UNICODE elements from source to target using
   memcpy(); the expression yields memcpy()'s return value. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
195
/* Store value into the first length elements of the Py_UNICODE
   buffer target.

   target and value are evaluated exactly once, before the loop, and
   the macro's locals use names that will not clash with identifiers
   in the caller's expressions. (A plain local named `i` would be
   picked up by a call such as Py_UNICODE_FILL(buf + i, c, n) and fill
   the wrong cells.) length is still evaluated on every iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        int py_fill_i_; \
        Py_UNICODE *py_fill_t_ = (target); \
        Py_UNICODE py_fill_v_ = (value); \
        for (py_fill_i_ = 0; py_fill_i_ < (length); py_fill_i_++) \
            py_fill_t_[py_fill_i_] = py_fill_v_; \
    } while (0)
199
/* Non-zero if the contents of substring occur in string starting at
   index offset. The first element is compared directly as a cheap
   pre-check; only if it matches are substring->length elements
   compared with memcmp(). No bounds checking is done here. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    (((string)->str[(offset)] == (substring)->str[0]) && \
     memcmp((string)->str + (offset), (substring)->str, \
            (substring)->length * sizeof(Py_UNICODE)) == 0)
204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205#ifdef __cplusplus
206extern "C" {
207#endif
208
Guido van Rossumd8225182000-03-10 22:33:05 +0000209/* --- Unicode Type ------------------------------------------------------- */
210
211typedef struct {
212 PyObject_HEAD
213 int length; /* Length of raw Unicode data in buffer */
214 Py_UNICODE *str; /* Raw Unicode buffer */
215 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000216 PyObject *defenc; /* (Default) Encoded version as Python
217 string, or NULL; this is used for
218 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000219} PyUnicodeObject;
220
221extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
222
/* Non-zero if op's type pointer is exactly &PyUnicode_Type. */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
224
/* Fast access macros. Each one casts op to PyUnicodeObject * and
   reads the field directly; no type checking is performed. */

/* Number of Py_UNICODE elements */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)

/* Size of the raw buffer contents in bytes */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * sizeof(Py_UNICODE))

/* The raw buffer as Py_UNICODE * */
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)

/* The raw buffer viewed as const char * */
#define PyUnicode_AS_DATA(op) ((const char *)PyUnicode_AS_UNICODE(op))
234
235/* --- Constants ---------------------------------------------------------- */
236
/* Replacement character used during decoding when the errors
   argument is set to "replace". U+FFFD is the official REPLACEMENT
   CHARACTER in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE)0xFFFD)
243
244/* === Public API ========================================================= */
245
246/* --- Plain Py_UNICODE --------------------------------------------------- */
247
248/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000249 size.
250
251 u may be NULL which causes the contents to be undefined. It is the
252 user's responsibility to fill in the needed data afterwards. Note
253 that modifying the Unicode object contents after construction is
254 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000255
256 The buffer is copied into the new object. */
257
258extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
259 const Py_UNICODE *u, /* Unicode buffer */
260 int size /* size of buffer */
261 );
262
263/* Return a read-only pointer to the Unicode object's internal
264 Py_UNICODE buffer. */
265
266extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
267 PyObject *unicode /* Unicode object */
268 );
269
270/* Get the length of the Unicode object. */
271
272extern DL_IMPORT(int) PyUnicode_GetSize(
273 PyObject *unicode /* Unicode object */
274 );
275
Guido van Rossum52c23592000-04-10 13:41:41 +0000276/* Resize an already allocated Unicode object to the new size length.
277
278 *unicode is modified to point to the new (resized) object and 0
279 returned on success.
280
281 This API may only be called by the function which also called the
282 Unicode constructor. The refcount on the object must be 1. Otherwise,
283 an error is returned.
284
285 Error handling is implemented as follows: an exception is set, -1
286 is returned and *unicode left untouched.
287
288*/
289
290extern DL_IMPORT(int) PyUnicode_Resize(
291 PyObject **unicode, /* Pointer to the Unicode object */
292 int length /* New length */
293 );
294
Guido van Rossumd8225182000-03-10 22:33:05 +0000295/* Coerce obj to an Unicode object and return a reference with
296 *incremented* refcount.
297
298 Coercion is done in the following way:
299
300 1. Unicode objects are passed back as-is with incremented
301 refcount.
302
303 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000304 under the assumptions that they contain data using the current
305 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000306
307 3. All other objects raise an exception.
308
309 The API returns NULL in case of an error. The caller is responsible
310 for decref'ing the returned objects.
311
312*/
313
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000314extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
315 register PyObject *obj, /* Object */
316 const char *encoding, /* encoding */
317 const char *errors /* error handling */
318 );
319
320/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
321 which results in using the default encoding as basis for
322 decoding the object.
323
324 Coerces obj to an Unicode object and return a reference with
325 *incremented* refcount.
326
327 The API returns NULL in case of an error. The caller is responsible
328 for decref'ing the returned objects.
329
330*/
331
Guido van Rossumd8225182000-03-10 22:33:05 +0000332extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
333 register PyObject *obj /* Object */
334 );
335
336/* --- wchar_t support for platforms which support it --------------------- */
337
338#ifdef HAVE_WCHAR_H
339
340/* Create a Unicode Object from the whcar_t buffer w of the given
341 size.
342
343 The buffer is copied into the new object. */
344
345extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
346 register const wchar_t *w, /* wchar_t buffer */
347 int size /* size of buffer */
348 );
349
350/* Copies the Unicode Object contents into the whcar_t buffer w. At
351 most size wchar_t characters are copied.
352
353 Returns the number of wchar_t characters copied or -1 in case of an
354 error. */
355
356extern DL_IMPORT(int) PyUnicode_AsWideChar(
357 PyUnicodeObject *unicode, /* Unicode object */
358 register wchar_t *w, /* wchar_t buffer */
359 int size /* size of buffer */
360 );
361
362#endif
363
364/* === Builtin Codecs =====================================================
365
366 Many of these APIs take two arguments encoding and errors. These
367 parameters encoding and errors have the same semantics as the ones
368 of the builtin unicode() API.
369
Fred Drakecb093fe2000-05-09 19:51:53 +0000370 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000371
372 Error handling is set by errors which may also be set to NULL
373 meaning to use the default handling defined for the codec. Default
374 error handling for all builtin codecs is "strict" (ValueErrors are
375 raised).
376
   The codecs all use a similar interface. Only deviations from the
   generic interface are documented.
379
380*/
381
Fred Drakecb093fe2000-05-09 19:51:53 +0000382/* --- Manage the default encoding ---------------------------------------- */
383
384/* Returns the currently active default encoding.
385
386 The default encoding is currently implemented as run-time settable
387 process global. This may change in future versions of the
388 interpreter to become a parameter which is managed on a per-thread
389 basis.
390
391 */
392
Thomas Wouters5f375912000-07-22 23:30:03 +0000393extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000394
395/* Sets the currently active default encoding.
396
397 Returns 0 on success, -1 in case of an error.
398
399 */
400
401extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
402 const char *encoding /* Encoding name in standard form */
403 );
404
Guido van Rossumd8225182000-03-10 22:33:05 +0000405/* --- Generic Codecs ----------------------------------------------------- */
406
407/* Create a Unicode object by decoding the encoded string s of the
408 given size. */
409
410extern DL_IMPORT(PyObject*) PyUnicode_Decode(
411 const char *s, /* encoded string */
412 int size, /* size of buffer */
413 const char *encoding, /* encoding */
414 const char *errors /* error handling */
415 );
416
417/* Encodes a Py_UNICODE buffer of the given size and returns a
418 Python string object. */
419
420extern DL_IMPORT(PyObject*) PyUnicode_Encode(
421 const Py_UNICODE *s, /* Unicode char buffer */
422 int size, /* number of Py_UNICODE chars to encode */
423 const char *encoding, /* encoding */
424 const char *errors /* error handling */
425 );
426
427/* Encodes a Unicode object and returns the result as Python string
428 object. */
429
430extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
431 PyObject *unicode, /* Unicode object */
432 const char *encoding, /* encoding */
433 const char *errors /* error handling */
434 );
435
436/* --- UTF-8 Codecs ------------------------------------------------------- */
437
438extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
439 const char *string, /* UTF-8 encoded string */
440 int length, /* size of string */
441 const char *errors /* error handling */
442 );
443
444extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
445 PyObject *unicode /* Unicode object */
446 );
447
448extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
449 const Py_UNICODE *data, /* Unicode char buffer */
450 int length, /* number of Py_UNICODE chars to encode */
451 const char *errors /* error handling */
452 );
453
454/* --- UTF-16 Codecs ------------------------------------------------------ */
455
Guido van Rossum9e896b32000-04-05 20:11:21 +0000456/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000457 the corresponding Unicode object.
458
459 errors (if non-NULL) defines the error handling. It defaults
460 to "strict".
461
462 If byteorder is non-NULL, the decoder starts decoding using the
463 given byte order:
464
465 *byteorder == -1: little endian
466 *byteorder == 0: native order
467 *byteorder == 1: big endian
468
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000469 In native mode, the first two bytes of the stream are checked for a
470 BOM mark. If found, the BOM mark is analysed, the byte order
471 adjusted and the BOM skipped. In the other modes, no BOM mark
472 interpretation is done. After completion, *byteorder is set to the
473 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000474
475 If byteorder is NULL, the codec starts in native order mode.
476
477*/
478
479extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
480 const char *string, /* UTF-16 encoded string */
481 int length, /* size of string */
482 const char *errors, /* error handling */
483 int *byteorder /* pointer to byteorder to use
484 0=native;-1=LE,1=BE; updated on
485 exit */
486 );
487
488/* Returns a Python string using the UTF-16 encoding in native byte
489 order. The string always starts with a BOM mark. */
490
491extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
492 PyObject *unicode /* Unicode object */
493 );
494
495/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000496 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000497
498 If byteorder is not 0, output is written according to the following
499 byte order:
500
501 byteorder == -1: little endian
502 byteorder == 0: native byte order (writes a BOM mark)
503 byteorder == 1: big endian
504
505 If byteorder is 0, the output string will always start with the
506 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
507 prepended.
508
509 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
510 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000511 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000512
513*/
514
515extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
516 const Py_UNICODE *data, /* Unicode char buffer */
517 int length, /* number of Py_UNICODE chars to encode */
518 const char *errors, /* error handling */
519 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
520 );
521
522/* --- Unicode-Escape Codecs ---------------------------------------------- */
523
524extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
525 const char *string, /* Unicode-Escape encoded string */
526 int length, /* size of string */
527 const char *errors /* error handling */
528 );
529
530extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
531 PyObject *unicode /* Unicode object */
532 );
533
534extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
535 const Py_UNICODE *data, /* Unicode char buffer */
536 int length /* Number of Py_UNICODE chars to encode */
537 );
538
539/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
540
541extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
542 const char *string, /* Raw-Unicode-Escape encoded string */
543 int length, /* size of string */
544 const char *errors /* error handling */
545 );
546
547extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
548 PyObject *unicode /* Unicode object */
549 );
550
551extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
552 const Py_UNICODE *data, /* Unicode char buffer */
553 int length /* Number of Py_UNICODE chars to encode */
554 );
555
556/* --- Latin-1 Codecs -----------------------------------------------------
557
558 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
559
560*/
561
562extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
563 const char *string, /* Latin-1 encoded string */
564 int length, /* size of string */
565 const char *errors /* error handling */
566 );
567
568extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
569 PyObject *unicode /* Unicode object */
570 );
571
572extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
573 const Py_UNICODE *data, /* Unicode char buffer */
574 int length, /* Number of Py_UNICODE chars to encode */
575 const char *errors /* error handling */
576 );
577
/* --- ASCII Codecs -------------------------------------------------------

   Only 7-bit ASCII data is accepted. All other codes generate errors.

*/
583
584extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
585 const char *string, /* ASCII encoded string */
586 int length, /* size of string */
587 const char *errors /* error handling */
588 );
589
590extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
591 PyObject *unicode /* Unicode object */
592 );
593
594extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
595 const Py_UNICODE *data, /* Unicode char buffer */
596 int length, /* Number of Py_UNICODE chars to encode */
597 const char *errors /* error handling */
598 );
599
600/* --- Character Map Codecs -----------------------------------------------
601
602 This codec uses mappings to encode and decode characters.
603
604 Decoding mappings must map single string characters to single
605 Unicode characters, integers (which are then interpreted as Unicode
606 ordinals) or None (meaning "undefined mapping" and causing an
607 error).
608
609 Encoding mappings must map single Unicode characters to single
610 string characters, integers (which are then interpreted as Latin-1
611 ordinals) or None (meaning "undefined mapping" and causing an
612 error).
613
614 If a character lookup fails with a LookupError, the character is
615 copied as-is meaning that its ordinal value will be interpreted as
616 Unicode or Latin-1 ordinal resp. Because of this mappings only need
617 to contain those mappings which map characters to different code
618 points.
619
620*/
621
622extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
623 const char *string, /* Encoded string */
624 int length, /* size of string */
625 PyObject *mapping, /* character mapping
626 (char ordinal -> unicode ordinal) */
627 const char *errors /* error handling */
628 );
629
630extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
631 PyObject *unicode, /* Unicode object */
632 PyObject *mapping /* character mapping
633 (unicode ordinal -> char ordinal) */
634 );
635
636extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
637 const Py_UNICODE *data, /* Unicode char buffer */
638 int length, /* Number of Py_UNICODE chars to encode */
639 PyObject *mapping, /* character mapping
640 (unicode ordinal -> char ordinal) */
641 const char *errors /* error handling */
642 );
643
644/* Translate a Py_UNICODE buffer of the given length by applying a
645 character mapping table to it and return the resulting Unicode
646 object.
647
648 The mapping table must map Unicode ordinal integers to Unicode
649 ordinal integers or None (causing deletion of the character).
650
651 Mapping tables may be dictionaries or sequences. Unmapped character
652 ordinals (ones which cause a LookupError) are left untouched and
653 are copied as-is.
654
655*/
656
657extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
658 const Py_UNICODE *data, /* Unicode char buffer */
659 int length, /* Number of Py_UNICODE chars to encode */
660 PyObject *table, /* Translate table */
661 const char *errors /* error handling */
662 );
663
Guido van Rossumefec1152000-03-28 02:01:15 +0000664#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000665
Guido van Rossumefec1152000-03-28 02:01:15 +0000666/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000667
Guido van Rossumefec1152000-03-28 02:01:15 +0000668extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
669 const char *string, /* MBCS encoded string */
670 int length, /* size of string */
671 const char *errors /* error handling */
672 );
673
674extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
675 PyObject *unicode /* Unicode object */
676 );
677
678extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
679 const Py_UNICODE *data, /* Unicode char buffer */
680 int length, /* Number of Py_UNICODE chars to encode */
681 const char *errors /* error handling */
682 );
683
Guido van Rossumefec1152000-03-28 02:01:15 +0000684#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000685
Guido van Rossum9e896b32000-04-05 20:11:21 +0000686/* --- Decimal Encoder ---------------------------------------------------- */
687
688/* Takes a Unicode string holding a decimal value and writes it into
689 an output buffer using standard ASCII digit codes.
690
691 The output buffer has to provide at least length+1 bytes of storage
692 area. The output string is 0-terminated.
693
694 The encoder converts whitespace to ' ', decimal characters to their
695 corresponding ASCII digit and all other Latin-1 characters except
696 \0 as-is. Characters outside this range (Unicode ordinals 1-256)
697 are treated as errors. This includes embedded NULL bytes.
698
699 Error handling is defined by the errors argument:
700
701 NULL or "strict": raise a ValueError
702 "ignore": ignore the wrong characters (these are not copied to the
703 output buffer)
704 "replace": replaces illegal characters with '?'
705
706 Returns 0 on success, -1 on failure.
707
708*/
709
710extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
711 Py_UNICODE *s, /* Unicode buffer */
712 int length, /* Number of Py_UNICODE chars to encode */
713 char *output, /* Output buffer; must have size >= length */
714 const char *errors /* error handling */
715 );
716
/* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
722
723/* Concat two strings giving a new Unicode string. */
724
725extern DL_IMPORT(PyObject*) PyUnicode_Concat(
726 PyObject *left, /* Left string */
727 PyObject *right /* Right string */
728 );
729
730/* Split a string giving a list of Unicode strings.
731
732 If sep is NULL, splitting will be done at all whitespace
733 substrings. Otherwise, splits occur at the given separator.
734
735 At most maxsplit splits will be done. If negative, no limit is set.
736
737 Separators are not included in the resulting list.
738
739*/
740
741extern DL_IMPORT(PyObject*) PyUnicode_Split(
742 PyObject *s, /* String to split */
743 PyObject *sep, /* String separator */
744 int maxsplit /* Maxsplit count */
745 );
746
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list. */
751
752extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
753 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000754 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000755 );
756
757/* Translate a string by applying a character mapping table to it and
758 return the resulting Unicode object.
759
760 The mapping table must map Unicode ordinal integers to Unicode
761 ordinal integers or None (causing deletion of the character).
762
763 Mapping tables may be dictionaries or sequences. Unmapped character
764 ordinals (ones which cause a LookupError) are left untouched and
765 are copied as-is.
766
767*/
768
769extern DL_IMPORT(PyObject *) PyUnicode_Translate(
770 PyObject *str, /* String */
771 PyObject *table, /* Translate table */
772 const char *errors /* error handling */
773 );
774
775/* Join a sequence of strings using the given separator and return
776 the resulting Unicode string. */
777
778extern DL_IMPORT(PyObject*) PyUnicode_Join(
779 PyObject *separator, /* Separator string */
780 PyObject *seq /* Sequence object */
781 );
782
783/* Return 1 if substr matches str[start:end] at the given tail end, 0
784 otherwise. */
785
786extern DL_IMPORT(int) PyUnicode_Tailmatch(
787 PyObject *str, /* String */
788 PyObject *substr, /* Prefix or Suffix string */
789 int start, /* Start index */
790 int end, /* Stop index */
791 int direction /* Tail end: -1 prefix, +1 suffix */
792 );
793
794/* Return the first position of substr in str[start:end] using the
795 given search direction or -1 if not found. */
796
797extern DL_IMPORT(int) PyUnicode_Find(
798 PyObject *str, /* String */
799 PyObject *substr, /* Substring to find */
800 int start, /* Start index */
801 int end, /* Stop index */
802 int direction /* Find direction: +1 forward, -1 backward */
803 );
804
Barry Warsaw51ac5802000-03-20 16:36:48 +0000805/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000806
807extern DL_IMPORT(int) PyUnicode_Count(
808 PyObject *str, /* String */
809 PyObject *substr, /* Substring to count */
810 int start, /* Start index */
811 int end /* Stop index */
812 );
813
Barry Warsaw51ac5802000-03-20 16:36:48 +0000814/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000815 and return the resulting Unicode object. */
816
817extern DL_IMPORT(PyObject *) PyUnicode_Replace(
818 PyObject *str, /* String */
819 PyObject *substr, /* Substring to find */
820 PyObject *replstr, /* Substring to replace */
821 int maxcount /* Max. number of replacements to apply;
822 -1 = all */
823 );
824
825/* Compare two strings and return -1, 0, 1 for less than, equal,
826 greater than resp. */
827
828extern DL_IMPORT(int) PyUnicode_Compare(
829 PyObject *left, /* Left string */
830 PyObject *right /* Right string */
831 );
832
/* Apply an argument tuple or dictionary to a format string and return
   the resulting Unicode string. */
835
836extern DL_IMPORT(PyObject *) PyUnicode_Format(
837 PyObject *format, /* Format string */
838 PyObject *args /* Argument tuple or dictionary */
839 );
840
/* Checks whether element is contained in container and returns 1/0
   accordingly.

   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
846
847extern DL_IMPORT(int) PyUnicode_Contains(
848 PyObject *container, /* Container string */
849 PyObject *element /* Element string */
850 );
851
Guido van Rossumd8225182000-03-10 22:33:05 +0000852/* === Characters Type APIs =============================================== */
853
854/* These should not be used directly. Use the Py_UNICODE_IS* and
855 Py_UNICODE_TO* macros instead.
856
857 These APIs are implemented in Objects/unicodectype.c.
858
859*/
860
861extern DL_IMPORT(int) _PyUnicode_IsLowercase(
862 register const Py_UNICODE ch /* Unicode character */
863 );
864
865extern DL_IMPORT(int) _PyUnicode_IsUppercase(
866 register const Py_UNICODE ch /* Unicode character */
867 );
868
869extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
870 register const Py_UNICODE ch /* Unicode character */
871 );
872
873extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
874 register const Py_UNICODE ch /* Unicode character */
875 );
876
877extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
878 register const Py_UNICODE ch /* Unicode character */
879 );
880
881extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
882 register const Py_UNICODE ch /* Unicode character */
883 );
884
885extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
886 register const Py_UNICODE ch /* Unicode character */
887 );
888
889extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
890 register const Py_UNICODE ch /* Unicode character */
891 );
892
893extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
894 register const Py_UNICODE ch /* Unicode character */
895 );
896
897extern DL_IMPORT(int) _PyUnicode_ToDigit(
898 register const Py_UNICODE ch /* Unicode character */
899 );
900
901extern DL_IMPORT(double) _PyUnicode_ToNumeric(
902 register const Py_UNICODE ch /* Unicode character */
903 );
904
905extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
906 register const Py_UNICODE ch /* Unicode character */
907 );
908
909extern DL_IMPORT(int) _PyUnicode_IsDigit(
910 register const Py_UNICODE ch /* Unicode character */
911 );
912
913extern DL_IMPORT(int) _PyUnicode_IsNumeric(
914 register const Py_UNICODE ch /* Unicode character */
915 );
916
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000917extern DL_IMPORT(int) _PyUnicode_IsAlpha(
918 register const Py_UNICODE ch /* Unicode character */
919 );
920
Guido van Rossumd8225182000-03-10 22:33:05 +0000921#ifdef __cplusplus
922}
923#endif
924#endif /* !Py_UNICODEOBJECT_H */