blob: 41fffc0cd0d6490f6a5032f16e4f4a3cee5ea2ad [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
/* Defaults for various platforms */
#ifndef HAVE_USABLE_WCHAR_T

/* Windows has a usable wchar_t type */
# if defined(MS_WIN32)
#  define HAVE_USABLE_WCHAR_T
# endif

#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). */

/* A usable wchar_t implies that <wchar.h> is available.  The flag is
   only defined here if nothing has defined it before: defining it a
   second time with a different replacement list (for instance after an
   earlier "#define HAVE_WCHAR_H 1") would be an invalid macro
   redefinition. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include <wchar.h>
#endif

#ifdef HAVE_USABLE_WCHAR_T

/* If the compiler defines wchar_t as a 16-bit unsigned type we can
   use the compiler type directly. Works fine with all modern Windows
   platforms. */

typedef wchar_t Py_UNICODE;

#else

/* Use if you have a standard ANSI compiler, without wchar_t support.
   If a short is not 16 bits on your platform, you have to fix the
   typedef below, or the module initialization code will complain. */

typedef unsigned short Py_UNICODE;

#endif
109
/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as a single unsigned integer.
 *
 * unsigned int is chosen when SIZEOF_INT reports at least 4 bytes.
 * In every other case (including SIZEOF_INT not being defined at all)
 * unsigned long is used; ISO C guarantees it holds at least 32 bits,
 * so Py_UCS4 is always defined.
 */
#if defined(SIZEOF_INT) && SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif
119
120
Guido van Rossumd8225182000-03-10 22:33:05 +0000121/* --- Internal Unicode Operations ---------------------------------------- */
122
/* If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-ctype-functions. This reduces the
   interpreter's code size. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include <wctype.h>

/* Classification and case mapping delegated to the C library. */

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#else

/* Classification and case mapping done by the _PyUnicode_*() functions. */

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#endif

/* The operations below map to the _PyUnicode_*() functions in both
   configurations, so they are defined once, outside the conditional. */

#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000177
/* True if ch is alphabetic or is classified as a decimal, digit or
   numeric character.  ch appears in all four tests, so it may be
   evaluated up to four times and must not have side effects. */
#define Py_UNICODE_ISALNUM(ch) \
       (Py_UNICODE_ISALPHA(ch) || \
        Py_UNICODE_ISDECIMAL(ch) || \
        Py_UNICODE_ISDIGIT(ch) || \
        Py_UNICODE_ISNUMERIC(ch))
183
/* Copy length Py_UNICODE elements from source to target with memcpy();
   the two buffers therefore must not overlap.  The expression has
   memcpy()'s result as its value. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
186
/* Store value into the first length elements of the Py_UNICODE buffer
   target.

   target, value and length are each evaluated exactly once, before the
   loop starts, and the loop state is kept in locals named fill_..._ .
   This way an argument that mentions a caller variable called i (for
   example Py_UNICODE_FILL(buf, ch, i)) is not captured by the macro's
   own loop counter, and arguments with side effects are safe. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *fill_target_ = (target);\
     Py_UNICODE fill_value_ = (value);\
     int fill_length_ = (length);\
     int fill_index_;\
     for (fill_index_ = 0; fill_index_ < fill_length_; fill_index_++)\
         fill_target_[fill_index_] = fill_value_;}\
    while (0)
190
/* Evaluates to 1 if substring occurs in string at index offset, and to
   0 otherwise.  The first characters are compared before memcmp() is
   called so that most mismatches are rejected without the call.  The
   macro itself performs no bounds checking on offset. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
     !memcmp((string)->str + (offset), (substring)->str, \
             (substring)->length*sizeof(Py_UNICODE)))
195
Barry Warsaw51ac5802000-03-20 16:36:48 +0000196#ifdef __cplusplus
197extern "C" {
198#endif
199
Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
202typedef struct {
203 PyObject_HEAD
204 int length; /* Length of raw Unicode data in buffer */
205 Py_UNICODE *str; /* Raw Unicode buffer */
206 long hash; /* Hash value; -1 if not set */
207 PyObject *utf8str; /* UTF-8 encoded version as Python string,
208 or NULL */
209} PyUnicodeObject;
210
211extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
212
/* True if the type of op is exactly PyUnicode_Type.  op is not cast, so
   it must be a pointer to a structure with an ob_type member. */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
214
/* Fast access macros.  Each one casts op to PyUnicodeObject * and reads
   the field directly; no type or NULL check is performed.

   PyUnicode_GET_SIZE       length in Py_UNICODE elements
   PyUnicode_GET_DATA_SIZE  length in bytes
   PyUnicode_AS_UNICODE     the buffer as Py_UNICODE *
   PyUnicode_AS_DATA        the same buffer as const char * */
#define PyUnicode_GET_SIZE(op) \
        (((PyUnicodeObject *)(op))->length)
#define PyUnicode_GET_DATA_SIZE(op) \
        (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
#define PyUnicode_AS_UNICODE(op) \
        (((PyUnicodeObject *)(op))->str)
#define PyUnicode_AS_DATA(op) \
        ((const char *)((PyUnicodeObject *)(op))->str)
224
225/* --- Constants ---------------------------------------------------------- */
226
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
233
234/* === Public API ========================================================= */
235
236/* --- Plain Py_UNICODE --------------------------------------------------- */
237
238/* Create a Unicode Object from the Py_UNICODE buffer u of the given
239 size. u may be NULL which causes the contents to be undefined. It
240 is the user's responsibility to fill in the needed data.
241
242 The buffer is copied into the new object. */
243
244extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
245 const Py_UNICODE *u, /* Unicode buffer */
246 int size /* size of buffer */
247 );
248
249/* Return a read-only pointer to the Unicode object's internal
250 Py_UNICODE buffer. */
251
252extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
253 PyObject *unicode /* Unicode object */
254 );
255
256/* Get the length of the Unicode object. */
257
258extern DL_IMPORT(int) PyUnicode_GetSize(
259 PyObject *unicode /* Unicode object */
260 );
261
Guido van Rossum52c23592000-04-10 13:41:41 +0000262/* Resize an already allocated Unicode object to the new size length.
263
264 *unicode is modified to point to the new (resized) object and 0
265 returned on success.
266
267 This API may only be called by the function which also called the
268 Unicode constructor. The refcount on the object must be 1. Otherwise,
269 an error is returned.
270
271 Error handling is implemented as follows: an exception is set, -1
272 is returned and *unicode left untouched.
273
274*/
275
276extern DL_IMPORT(int) PyUnicode_Resize(
277 PyObject **unicode, /* Pointer to the Unicode object */
278 int length /* New length */
279 );
280
/* Coerce obj to a Unicode object and return a reference with
   *incremented* refcount.

   Coercion is done in the following way:

   1. Unicode objects are passed back as-is with incremented
      refcount.

   2. String and other char buffer compatible objects are decoded
      using the given encoding and errors arguments. Passing NULL as
      encoding selects the current default encoding.

   3. All other objects raise an exception.

   The API returns NULL in case of an error. The caller is responsible
   for decref'ing the returned objects.

*/
299
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000300extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
301 register PyObject *obj, /* Object */
302 const char *encoding, /* encoding */
303 const char *errors /* error handling */
304 );
305
/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
   which results in using the default encoding as basis for
   decoding the object.

   Coerces obj to a Unicode object and returns a reference with
   *incremented* refcount.

   The API returns NULL in case of an error. The caller is responsible
   for decref'ing the returned objects.

*/
317
Guido van Rossumd8225182000-03-10 22:33:05 +0000318extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
319 register PyObject *obj /* Object */
320 );
321
322/* --- wchar_t support for platforms which support it --------------------- */
323
324#ifdef HAVE_WCHAR_H
325
/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */
330
331extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
332 register const wchar_t *w, /* wchar_t buffer */
333 int size /* size of buffer */
334 );
335
/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */
341
342extern DL_IMPORT(int) PyUnicode_AsWideChar(
343 PyUnicodeObject *unicode, /* Unicode object */
344 register wchar_t *w, /* wchar_t buffer */
345 int size /* size of buffer */
346 );
347
348#endif
349
350/* === Builtin Codecs =====================================================
351
352 Many of these APIs take two arguments encoding and errors. These
353 parameters encoding and errors have the same semantics as the ones
354 of the builtin unicode() API.
355
Fred Drakecb093fe2000-05-09 19:51:53 +0000356 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000357
358 Error handling is set by errors which may also be set to NULL
359 meaning to use the default handling defined for the codec. Default
360 error handling for all builtin codecs is "strict" (ValueErrors are
361 raised).
362
   The codecs all use a similar interface. Only deviations from the
   generic interface are documented.
365
366*/
367
Fred Drakecb093fe2000-05-09 19:51:53 +0000368/* --- Manage the default encoding ---------------------------------------- */
369
370/* Returns the currently active default encoding.
371
372 The default encoding is currently implemented as run-time settable
373 process global. This may change in future versions of the
374 interpreter to become a parameter which is managed on a per-thread
375 basis.
376
377 */
378
379extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding();
380
381/* Sets the currently active default encoding.
382
383 Returns 0 on success, -1 in case of an error.
384
385 */
386
387extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
388 const char *encoding /* Encoding name in standard form */
389 );
390
Guido van Rossumd8225182000-03-10 22:33:05 +0000391/* --- Generic Codecs ----------------------------------------------------- */
392
393/* Create a Unicode object by decoding the encoded string s of the
394 given size. */
395
396extern DL_IMPORT(PyObject*) PyUnicode_Decode(
397 const char *s, /* encoded string */
398 int size, /* size of buffer */
399 const char *encoding, /* encoding */
400 const char *errors /* error handling */
401 );
402
403/* Encodes a Py_UNICODE buffer of the given size and returns a
404 Python string object. */
405
406extern DL_IMPORT(PyObject*) PyUnicode_Encode(
407 const Py_UNICODE *s, /* Unicode char buffer */
408 int size, /* number of Py_UNICODE chars to encode */
409 const char *encoding, /* encoding */
410 const char *errors /* error handling */
411 );
412
413/* Encodes a Unicode object and returns the result as Python string
414 object. */
415
416extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
417 PyObject *unicode, /* Unicode object */
418 const char *encoding, /* encoding */
419 const char *errors /* error handling */
420 );
421
422/* --- UTF-8 Codecs ------------------------------------------------------- */
423
424extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
425 const char *string, /* UTF-8 encoded string */
426 int length, /* size of string */
427 const char *errors /* error handling */
428 );
429
430extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
431 PyObject *unicode /* Unicode object */
432 );
433
434extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
435 const Py_UNICODE *data, /* Unicode char buffer */
436 int length, /* number of Py_UNICODE chars to encode */
437 const char *errors /* error handling */
438 );
439
440/* --- UTF-16 Codecs ------------------------------------------------------ */
441
Guido van Rossum9e896b32000-04-05 20:11:21 +0000442/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000443 the corresponding Unicode object.
444
445 errors (if non-NULL) defines the error handling. It defaults
446 to "strict".
447
448 If byteorder is non-NULL, the decoder starts decoding using the
449 given byte order:
450
451 *byteorder == -1: little endian
452 *byteorder == 0: native order
453 *byteorder == 1: big endian
454
455 and then switches according to all BOM marks it finds in the input
456 data. BOM marks are not copied into the resulting Unicode string.
457 After completion, *byteorder is set to the current byte order at
458 the end of input data.
459
460 If byteorder is NULL, the codec starts in native order mode.
461
462*/
463
464extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
465 const char *string, /* UTF-16 encoded string */
466 int length, /* size of string */
467 const char *errors, /* error handling */
468 int *byteorder /* pointer to byteorder to use
469 0=native;-1=LE,1=BE; updated on
470 exit */
471 );
472
473/* Returns a Python string using the UTF-16 encoding in native byte
474 order. The string always starts with a BOM mark. */
475
476extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
477 PyObject *unicode /* Unicode object */
478 );
479
480/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000481 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000482
483 If byteorder is not 0, output is written according to the following
484 byte order:
485
486 byteorder == -1: little endian
487 byteorder == 0: native byte order (writes a BOM mark)
488 byteorder == 1: big endian
489
490 If byteorder is 0, the output string will always start with the
491 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
492 prepended.
493
494 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
495 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000496 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000497
498*/
499
500extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
501 const Py_UNICODE *data, /* Unicode char buffer */
502 int length, /* number of Py_UNICODE chars to encode */
503 const char *errors, /* error handling */
504 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
505 );
506
507/* --- Unicode-Escape Codecs ---------------------------------------------- */
508
509extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
510 const char *string, /* Unicode-Escape encoded string */
511 int length, /* size of string */
512 const char *errors /* error handling */
513 );
514
515extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
516 PyObject *unicode /* Unicode object */
517 );
518
519extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
520 const Py_UNICODE *data, /* Unicode char buffer */
521 int length /* Number of Py_UNICODE chars to encode */
522 );
523
524/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
525
526extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
527 const char *string, /* Raw-Unicode-Escape encoded string */
528 int length, /* size of string */
529 const char *errors /* error handling */
530 );
531
532extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
533 PyObject *unicode /* Unicode object */
534 );
535
536extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
537 const Py_UNICODE *data, /* Unicode char buffer */
538 int length /* Number of Py_UNICODE chars to encode */
539 );
540
541/* --- Latin-1 Codecs -----------------------------------------------------
542
543 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
544
545*/
546
547extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
548 const char *string, /* Latin-1 encoded string */
549 int length, /* size of string */
550 const char *errors /* error handling */
551 );
552
553extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
554 PyObject *unicode /* Unicode object */
555 );
556
557extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
558 const Py_UNICODE *data, /* Unicode char buffer */
559 int length, /* Number of Py_UNICODE chars to encode */
560 const char *errors /* error handling */
561 );
562
/* --- ASCII Codecs -------------------------------------------------------

   Only 7-bit ASCII data is accepted. All other codes generate errors.

*/
568
569extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
570 const char *string, /* ASCII encoded string */
571 int length, /* size of string */
572 const char *errors /* error handling */
573 );
574
575extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
576 PyObject *unicode /* Unicode object */
577 );
578
579extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
580 const Py_UNICODE *data, /* Unicode char buffer */
581 int length, /* Number of Py_UNICODE chars to encode */
582 const char *errors /* error handling */
583 );
584
585/* --- Character Map Codecs -----------------------------------------------
586
587 This codec uses mappings to encode and decode characters.
588
589 Decoding mappings must map single string characters to single
590 Unicode characters, integers (which are then interpreted as Unicode
591 ordinals) or None (meaning "undefined mapping" and causing an
592 error).
593
594 Encoding mappings must map single Unicode characters to single
595 string characters, integers (which are then interpreted as Latin-1
596 ordinals) or None (meaning "undefined mapping" and causing an
597 error).
598
599 If a character lookup fails with a LookupError, the character is
600 copied as-is meaning that its ordinal value will be interpreted as
601 Unicode or Latin-1 ordinal resp. Because of this mappings only need
602 to contain those mappings which map characters to different code
603 points.
604
605*/
606
607extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
608 const char *string, /* Encoded string */
609 int length, /* size of string */
610 PyObject *mapping, /* character mapping
611 (char ordinal -> unicode ordinal) */
612 const char *errors /* error handling */
613 );
614
615extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
616 PyObject *unicode, /* Unicode object */
617 PyObject *mapping /* character mapping
618 (unicode ordinal -> char ordinal) */
619 );
620
621extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
622 const Py_UNICODE *data, /* Unicode char buffer */
623 int length, /* Number of Py_UNICODE chars to encode */
624 PyObject *mapping, /* character mapping
625 (unicode ordinal -> char ordinal) */
626 const char *errors /* error handling */
627 );
628
629/* Translate a Py_UNICODE buffer of the given length by applying a
630 character mapping table to it and return the resulting Unicode
631 object.
632
633 The mapping table must map Unicode ordinal integers to Unicode
634 ordinal integers or None (causing deletion of the character).
635
636 Mapping tables may be dictionaries or sequences. Unmapped character
637 ordinals (ones which cause a LookupError) are left untouched and
638 are copied as-is.
639
640*/
641
642extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
643 const Py_UNICODE *data, /* Unicode char buffer */
644 int length, /* Number of Py_UNICODE chars to encode */
645 PyObject *table, /* Translate table */
646 const char *errors /* error handling */
647 );
648
Guido van Rossumefec1152000-03-28 02:01:15 +0000649#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000650
Guido van Rossumefec1152000-03-28 02:01:15 +0000651/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000652
Guido van Rossumefec1152000-03-28 02:01:15 +0000653extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
654 const char *string, /* MBCS encoded string */
655 int length, /* size of string */
656 const char *errors /* error handling */
657 );
658
659extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
660 PyObject *unicode /* Unicode object */
661 );
662
663extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
664 const Py_UNICODE *data, /* Unicode char buffer */
665 int length, /* Number of Py_UNICODE chars to encode */
666 const char *errors /* error handling */
667 );
668
Guido van Rossumefec1152000-03-28 02:01:15 +0000669#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000670
Guido van Rossum9e896b32000-04-05 20:11:21 +0000671/* --- Decimal Encoder ---------------------------------------------------- */
672
673/* Takes a Unicode string holding a decimal value and writes it into
674 an output buffer using standard ASCII digit codes.
675
676 The output buffer has to provide at least length+1 bytes of storage
677 area. The output string is 0-terminated.
678
679 The encoder converts whitespace to ' ', decimal characters to their
680 corresponding ASCII digit and all other Latin-1 characters except
681 \0 as-is. Characters outside this range (Unicode ordinals 1-256)
682 are treated as errors. This includes embedded NULL bytes.
683
684 Error handling is defined by the errors argument:
685
686 NULL or "strict": raise a ValueError
687 "ignore": ignore the wrong characters (these are not copied to the
688 output buffer)
689 "replace": replaces illegal characters with '?'
690
691 Returns 0 on success, -1 on failure.
692
693*/
694
695extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
696 Py_UNICODE *s, /* Unicode buffer */
697 int length, /* Number of Py_UNICODE chars to encode */
698 char *output, /* Output buffer; must have size >= length */
699 const char *errors /* error handling */
700 );
701
/* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
707
708/* Concat two strings giving a new Unicode string. */
709
710extern DL_IMPORT(PyObject*) PyUnicode_Concat(
711 PyObject *left, /* Left string */
712 PyObject *right /* Right string */
713 );
714
715/* Split a string giving a list of Unicode strings.
716
717 If sep is NULL, splitting will be done at all whitespace
718 substrings. Otherwise, splits occur at the given separator.
719
720 At most maxsplit splits will be done. If negative, no limit is set.
721
722 Separators are not included in the resulting list.
723
724*/
725
726extern DL_IMPORT(PyObject*) PyUnicode_Split(
727 PyObject *s, /* String to split */
728 PyObject *sep, /* String separator */
729 int maxsplit /* Maxsplit count */
730 );
731
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
736
737extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
738 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000739 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000740 );
741
742/* Translate a string by applying a character mapping table to it and
743 return the resulting Unicode object.
744
745 The mapping table must map Unicode ordinal integers to Unicode
746 ordinal integers or None (causing deletion of the character).
747
748 Mapping tables may be dictionaries or sequences. Unmapped character
749 ordinals (ones which cause a LookupError) are left untouched and
750 are copied as-is.
751
752*/
753
754extern DL_IMPORT(PyObject *) PyUnicode_Translate(
755 PyObject *str, /* String */
756 PyObject *table, /* Translate table */
757 const char *errors /* error handling */
758 );
759
760/* Join a sequence of strings using the given separator and return
761 the resulting Unicode string. */
762
763extern DL_IMPORT(PyObject*) PyUnicode_Join(
764 PyObject *separator, /* Separator string */
765 PyObject *seq /* Sequence object */
766 );
767
768/* Return 1 if substr matches str[start:end] at the given tail end, 0
769 otherwise. */
770
771extern DL_IMPORT(int) PyUnicode_Tailmatch(
772 PyObject *str, /* String */
773 PyObject *substr, /* Prefix or Suffix string */
774 int start, /* Start index */
775 int end, /* Stop index */
776 int direction /* Tail end: -1 prefix, +1 suffix */
777 );
778
779/* Return the first position of substr in str[start:end] using the
780 given search direction or -1 if not found. */
781
782extern DL_IMPORT(int) PyUnicode_Find(
783 PyObject *str, /* String */
784 PyObject *substr, /* Substring to find */
785 int start, /* Start index */
786 int end, /* Stop index */
787 int direction /* Find direction: +1 forward, -1 backward */
788 );
789
Barry Warsaw51ac5802000-03-20 16:36:48 +0000790/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000791
792extern DL_IMPORT(int) PyUnicode_Count(
793 PyObject *str, /* String */
794 PyObject *substr, /* Substring to count */
795 int start, /* Start index */
796 int end /* Stop index */
797 );
798
Barry Warsaw51ac5802000-03-20 16:36:48 +0000799/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000800 and return the resulting Unicode object. */
801
802extern DL_IMPORT(PyObject *) PyUnicode_Replace(
803 PyObject *str, /* String */
804 PyObject *substr, /* Substring to find */
805 PyObject *replstr, /* Substring to replace */
806 int maxcount /* Max. number of replacements to apply;
807 -1 = all */
808 );
809
810/* Compare two strings and return -1, 0, 1 for less than, equal,
811 greater than resp. */
812
813extern DL_IMPORT(int) PyUnicode_Compare(
814 PyObject *left, /* Left string */
815 PyObject *right /* Right string */
816 );
817
/* Apply an argument tuple or dictionary to a format string and return
   the resulting Unicode string. */
820
821extern DL_IMPORT(PyObject *) PyUnicode_Format(
822 PyObject *format, /* Format string */
823 PyObject *args /* Argument tuple or dictionary */
824 );
825
/* Checks whether element is contained in container and returns 1/0
   accordingly.

   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
831
832extern DL_IMPORT(int) PyUnicode_Contains(
833 PyObject *container, /* Container string */
834 PyObject *element /* Element string */
835 );
836
Guido van Rossumd8225182000-03-10 22:33:05 +0000837/* === Characters Type APIs =============================================== */
838
839/* These should not be used directly. Use the Py_UNICODE_IS* and
840 Py_UNICODE_TO* macros instead.
841
842 These APIs are implemented in Objects/unicodectype.c.
843
844*/
845
846extern DL_IMPORT(int) _PyUnicode_IsLowercase(
847 register const Py_UNICODE ch /* Unicode character */
848 );
849
850extern DL_IMPORT(int) _PyUnicode_IsUppercase(
851 register const Py_UNICODE ch /* Unicode character */
852 );
853
854extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
855 register const Py_UNICODE ch /* Unicode character */
856 );
857
858extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
859 register const Py_UNICODE ch /* Unicode character */
860 );
861
862extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
863 register const Py_UNICODE ch /* Unicode character */
864 );
865
866extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
867 register const Py_UNICODE ch /* Unicode character */
868 );
869
870extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
871 register const Py_UNICODE ch /* Unicode character */
872 );
873
874extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
875 register const Py_UNICODE ch /* Unicode character */
876 );
877
878extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
879 register const Py_UNICODE ch /* Unicode character */
880 );
881
882extern DL_IMPORT(int) _PyUnicode_ToDigit(
883 register const Py_UNICODE ch /* Unicode character */
884 );
885
886extern DL_IMPORT(double) _PyUnicode_ToNumeric(
887 register const Py_UNICODE ch /* Unicode character */
888 );
889
890extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
891 register const Py_UNICODE ch /* Unicode character */
892 );
893
894extern DL_IMPORT(int) _PyUnicode_IsDigit(
895 register const Py_UNICODE ch /* Unicode character */
896 );
897
898extern DL_IMPORT(int) _PyUnicode_IsNumeric(
899 register const Py_UNICODE ch /* Unicode character */
900 );
901
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000902extern DL_IMPORT(int) _PyUnicode_IsAlpha(
903 register const Py_UNICODE ch /* Unicode character */
904 );
905
Guido van Rossumd8225182000-03-10 22:33:05 +0000906#ifdef __cplusplus
907}
908#endif
909#endif /* !Py_UNICODEOBJECT_H */