blob: f076fae53bbc004ba3f015670fcccd0d4cdcb082 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
/* Platform defaults: Windows (MS_WIN32) has a usable wchar_t type. */
#if !defined(HAVE_USABLE_WCHAR_T) && defined(MS_WIN32)
# define HAVE_USABLE_WCHAR_T
#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar().

   A usable wchar_t implies that "wchar.h" is available.  The #ifndef
   guard avoids an incompatible macro redefinition when the build
   configuration has already defined HAVE_WCHAR_H (for example to 1). */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif
83
84#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +000085/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87# include <time.h>
88# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000089# include "wchar.h"
90#endif
91
/* Py_UNICODE is the element type of the raw Unicode buffer. */
#ifndef HAVE_USABLE_WCHAR_T

/* Fallback for a standard ANSI compiler without usable wchar_t
   support.  If a short is not 16 bits on your platform, you have to
   fix the typedef below, or the module initialization code will
   complain. */
typedef unsigned short Py_UNICODE;

#else

/* The compiler defines wchar_t as a 16-bit unsigned type, so the
   compiler type can be used directly.  Works fine with all modern
   Windows platforms. */
typedef wchar_t Py_UNICODE;

#endif
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114 configure Python using --with-ctype-functions. This reduces the
115 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000116
/* Only whitespace/lowercase/uppercase tests and lower/upper case
   conversion can be delegated to the compiler's wctype.h; everything
   else always maps to the routines supplied with Python. */
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* Operations that are identical in both configurations. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
162
/* A character is treated as alphabetic when it is lowercase,
   uppercase or titlecase.  ch may be evaluated more than once. */
#define Py_UNICODE_ISALPHA(ch)       \
    (Py_UNICODE_ISLOWER(ch) ||       \
     Py_UNICODE_ISUPPER(ch) ||       \
     Py_UNICODE_ISTITLE(ch))

/* A character is treated as alphanumeric when it is alphabetic (see
   Py_UNICODE_ISALPHA) or a decimal, digit or numeric character.
   ch may be evaluated more than once. */
#define Py_UNICODE_ISALNUM(ch)       \
    (Py_UNICODE_ISALPHA(ch)   ||     \
     Py_UNICODE_ISDECIMAL(ch) ||     \
     Py_UNICODE_ISDIGIT(ch)   ||     \
     Py_UNICODE_ISNUMERIC(ch))
173
/* Copy length Py_UNICODE items from source to target using memcpy();
   the expression yields memcpy()'s return value. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
176
/* Store value into the first length items of the buffer target.

   The loop counter is named i_ rather than i so that it cannot shadow
   a caller variable named i that appears in one of the macro
   arguments (with the old name, Py_UNICODE_FILL(buf, v, i) compared
   the counter with itself and filled nothing).

   target, value and length are evaluated on every iteration, so do
   not pass expressions with side effects. */
#define Py_UNICODE_FILL(target, value, length) do\
    {int i_; for (i_ = 0; i_ < (length); i_++) (target)[i_] = (value);}\
    while (0)
180
/* Non-zero if the data of substring is found in string at position
   offset.  The first items are compared directly before memcmp() is
   run over substring->length items.  string and substring must be
   pointers to objects with str and length members; the macro itself
   performs no bounds checking. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
     memcmp((string)->str + (offset), (substring)->str, \
            (substring)->length * sizeof(Py_UNICODE)) == 0)
185
Barry Warsaw51ac5802000-03-20 16:36:48 +0000186#ifdef __cplusplus
187extern "C" {
188#endif
189
Guido van Rossumd8225182000-03-10 22:33:05 +0000190/* --- Unicode Type ------------------------------------------------------- */
191
192typedef struct {
193 PyObject_HEAD
194 int length; /* Length of raw Unicode data in buffer */
195 Py_UNICODE *str; /* Raw Unicode buffer */
196 long hash; /* Hash value; -1 if not set */
197 PyObject *utf8str; /* UTF-8 encoded version as Python string,
198 or NULL */
199} PyUnicodeObject;
200
201extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
202
/* Non-zero if op's ob_type is exactly PyUnicode_Type. */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
204
205/* Fast access macros */
/* These macros cast op to PyUnicodeObject * without checking its
   type. */

/* Length of op in Py_UNICODE items. */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)

/* Length of op's data in bytes. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))

/* op's internal buffer as Py_UNICODE pointer. */
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)

/* op's internal buffer viewed as const char pointer. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)((PyUnicodeObject *)(op))->str)
214
215/* --- Constants ---------------------------------------------------------- */
216
/* Replacement character used during decoding if the errors argument
   is set to "replace".  Note: U+FFFD is the official REPLACEMENT
   CHARACTER in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE)0xFFFD)
223
224/* === Public API ========================================================= */
225
226/* --- Plain Py_UNICODE --------------------------------------------------- */
227
228/* Create a Unicode Object from the Py_UNICODE buffer u of the given
229 size. u may be NULL which causes the contents to be undefined. It
230 is the user's responsibility to fill in the needed data.
231
232 The buffer is copied into the new object. */
233
234extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
235 const Py_UNICODE *u, /* Unicode buffer */
236 int size /* size of buffer */
237 );
238
239/* Return a read-only pointer to the Unicode object's internal
240 Py_UNICODE buffer. */
241
242extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
243 PyObject *unicode /* Unicode object */
244 );
245
246/* Get the length of the Unicode object. */
247
248extern DL_IMPORT(int) PyUnicode_GetSize(
249 PyObject *unicode /* Unicode object */
250 );
251
Guido van Rossum52c23592000-04-10 13:41:41 +0000252/* Resize an already allocated Unicode object to the new size length.
253
254 *unicode is modified to point to the new (resized) object and 0
255 returned on success.
256
257 This API may only be called by the function which also called the
258 Unicode constructor. The refcount on the object must be 1. Otherwise,
259 an error is returned.
260
261 Error handling is implemented as follows: an exception is set, -1
262 is returned and *unicode left untouched.
263
264*/
265
266extern DL_IMPORT(int) PyUnicode_Resize(
267 PyObject **unicode, /* Pointer to the Unicode object */
268 int length /* New length */
269 );
270
Guido van Rossumd8225182000-03-10 22:33:05 +0000271/* Coerce obj to an Unicode object and return a reference with
272 *incremented* refcount.
273
274 Coercion is done in the following way:
275
276 1. Unicode objects are passed back as-is with incremented
277 refcount.
278
279 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000280 under the assumptions that they contain data using the current
281 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000282
283 3. All other objects raise an exception.
284
285 The API returns NULL in case of an error. The caller is responsible
286 for decref'ing the returned objects.
287
288*/
289
290extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
291 register PyObject *obj /* Object */
292 );
293
294/* --- wchar_t support for platforms which support it --------------------- */
295
#ifdef HAVE_WCHAR_H

/* Parameters below are declared without `register`: the keyword has
   no effect in a prototype and is rejected by C++17 compilers that
   include this header. */

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    int size                    /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */

extern DL_IMPORT(int) PyUnicode_AsWideChar(
    PyUnicodeObject *unicode,   /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    int size                    /* size of buffer */
    );

#endif /* HAVE_WCHAR_H */
321
322/* === Builtin Codecs =====================================================
323
324 Many of these APIs take two arguments encoding and errors. These
325 parameters encoding and errors have the same semantics as the ones
326 of the builtin unicode() API.
327
Fred Drakecb093fe2000-05-09 19:51:53 +0000328 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000329
330 Error handling is set by errors which may also be set to NULL
331 meaning to use the default handling defined for the codec. Default
332 error handling for all builtin codecs is "strict" (ValueErrors are
333 raised).
334
335 The codecs all use a similar interface. Only deviations from the
336 generic ones are documented.
337
338*/
339
Fred Drakecb093fe2000-05-09 19:51:53 +0000340/* --- Manage the default encoding ---------------------------------------- */
341
342/* Returns the currently active default encoding.
343
344 The default encoding is currently implemented as run-time settable
345 process global. This may change in future versions of the
346 interpreter to become a parameter which is managed on a per-thread
347 basis.
348
349 */
350
351extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding();
352
353/* Sets the currently active default encoding.
354
355 Returns 0 on success, -1 in case of an error.
356
357 */
358
359extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
360 const char *encoding /* Encoding name in standard form */
361 );
362
Guido van Rossumd8225182000-03-10 22:33:05 +0000363/* --- Generic Codecs ----------------------------------------------------- */
364
365/* Create a Unicode object by decoding the encoded string s of the
366 given size. */
367
368extern DL_IMPORT(PyObject*) PyUnicode_Decode(
369 const char *s, /* encoded string */
370 int size, /* size of buffer */
371 const char *encoding, /* encoding */
372 const char *errors /* error handling */
373 );
374
375/* Encodes a Py_UNICODE buffer of the given size and returns a
376 Python string object. */
377
378extern DL_IMPORT(PyObject*) PyUnicode_Encode(
379 const Py_UNICODE *s, /* Unicode char buffer */
380 int size, /* number of Py_UNICODE chars to encode */
381 const char *encoding, /* encoding */
382 const char *errors /* error handling */
383 );
384
385/* Encodes a Unicode object and returns the result as Python string
386 object. */
387
388extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
389 PyObject *unicode, /* Unicode object */
390 const char *encoding, /* encoding */
391 const char *errors /* error handling */
392 );
393
394/* --- UTF-8 Codecs ------------------------------------------------------- */
395
396extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
397 const char *string, /* UTF-8 encoded string */
398 int length, /* size of string */
399 const char *errors /* error handling */
400 );
401
402extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
403 PyObject *unicode /* Unicode object */
404 );
405
406extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
407 const Py_UNICODE *data, /* Unicode char buffer */
408 int length, /* number of Py_UNICODE chars to encode */
409 const char *errors /* error handling */
410 );
411
412/* --- UTF-16 Codecs ------------------------------------------------------ */
413
Guido van Rossum9e896b32000-04-05 20:11:21 +0000414/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000415 the corresponding Unicode object.
416
417 errors (if non-NULL) defines the error handling. It defaults
418 to "strict".
419
420 If byteorder is non-NULL, the decoder starts decoding using the
421 given byte order:
422
423 *byteorder == -1: little endian
424 *byteorder == 0: native order
425 *byteorder == 1: big endian
426
427 and then switches according to all BOM marks it finds in the input
428 data. BOM marks are not copied into the resulting Unicode string.
429 After completion, *byteorder is set to the current byte order at
430 the end of input data.
431
432 If byteorder is NULL, the codec starts in native order mode.
433
434*/
435
436extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
437 const char *string, /* UTF-16 encoded string */
438 int length, /* size of string */
439 const char *errors, /* error handling */
440 int *byteorder /* pointer to byteorder to use
441 0=native;-1=LE,1=BE; updated on
442 exit */
443 );
444
445/* Returns a Python string using the UTF-16 encoding in native byte
446 order. The string always starts with a BOM mark. */
447
448extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
449 PyObject *unicode /* Unicode object */
450 );
451
452/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000453 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000454
455 If byteorder is not 0, output is written according to the following
456 byte order:
457
458 byteorder == -1: little endian
459 byteorder == 0: native byte order (writes a BOM mark)
460 byteorder == 1: big endian
461
462 If byteorder is 0, the output string will always start with the
463 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
464 prepended.
465
466 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
467 UCS-2. This trick makes it possible to add full UTF-16 capabilities
468 at a later point without compromising the APIs.
469
470*/
471
472extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
473 const Py_UNICODE *data, /* Unicode char buffer */
474 int length, /* number of Py_UNICODE chars to encode */
475 const char *errors, /* error handling */
476 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
477 );
478
479/* --- Unicode-Escape Codecs ---------------------------------------------- */
480
481extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
482 const char *string, /* Unicode-Escape encoded string */
483 int length, /* size of string */
484 const char *errors /* error handling */
485 );
486
487extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
488 PyObject *unicode /* Unicode object */
489 );
490
491extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
492 const Py_UNICODE *data, /* Unicode char buffer */
493 int length /* Number of Py_UNICODE chars to encode */
494 );
495
496/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
497
498extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
499 const char *string, /* Raw-Unicode-Escape encoded string */
500 int length, /* size of string */
501 const char *errors /* error handling */
502 );
503
504extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
505 PyObject *unicode /* Unicode object */
506 );
507
508extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
509 const Py_UNICODE *data, /* Unicode char buffer */
510 int length /* Number of Py_UNICODE chars to encode */
511 );
512
513/* --- Latin-1 Codecs -----------------------------------------------------
514
515 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
516
517*/
518
519extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
520 const char *string, /* Latin-1 encoded string */
521 int length, /* size of string */
522 const char *errors /* error handling */
523 );
524
525extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
526 PyObject *unicode /* Unicode object */
527 );
528
529extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
530 const Py_UNICODE *data, /* Unicode char buffer */
531 int length, /* Number of Py_UNICODE chars to encode */
532 const char *errors /* error handling */
533 );
534
535/* --- ASCII Codecs -------------------------------------------------------
536
537 Only 7-bit ASCII data is accepted. All other codes generate errors.
538
539*/
540
541extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
542 const char *string, /* ASCII encoded string */
543 int length, /* size of string */
544 const char *errors /* error handling */
545 );
546
547extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
548 PyObject *unicode /* Unicode object */
549 );
550
551extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
552 const Py_UNICODE *data, /* Unicode char buffer */
553 int length, /* Number of Py_UNICODE chars to encode */
554 const char *errors /* error handling */
555 );
556
557/* --- Character Map Codecs -----------------------------------------------
558
559 This codec uses mappings to encode and decode characters.
560
561 Decoding mappings must map single string characters to single
562 Unicode characters, integers (which are then interpreted as Unicode
563 ordinals) or None (meaning "undefined mapping" and causing an
564 error).
565
566 Encoding mappings must map single Unicode characters to single
567 string characters, integers (which are then interpreted as Latin-1
568 ordinals) or None (meaning "undefined mapping" and causing an
569 error).
570
571 If a character lookup fails with a LookupError, the character is
572 copied as-is meaning that its ordinal value will be interpreted as
573 Unicode or Latin-1 ordinal resp. Because of this mappings only need
574 to contain those mappings which map characters to different code
575 points.
576
577*/
578
579extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
580 const char *string, /* Encoded string */
581 int length, /* size of string */
582 PyObject *mapping, /* character mapping
583 (char ordinal -> unicode ordinal) */
584 const char *errors /* error handling */
585 );
586
587extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
588 PyObject *unicode, /* Unicode object */
589 PyObject *mapping /* character mapping
590 (unicode ordinal -> char ordinal) */
591 );
592
593extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
594 const Py_UNICODE *data, /* Unicode char buffer */
595 int length, /* Number of Py_UNICODE chars to encode */
596 PyObject *mapping, /* character mapping
597 (unicode ordinal -> char ordinal) */
598 const char *errors /* error handling */
599 );
600
601/* Translate a Py_UNICODE buffer of the given length by applying a
602 character mapping table to it and return the resulting Unicode
603 object.
604
605 The mapping table must map Unicode ordinal integers to Unicode
606 ordinal integers or None (causing deletion of the character).
607
608 Mapping tables may be dictionaries or sequences. Unmapped character
609 ordinals (ones which cause a LookupError) are left untouched and
610 are copied as-is.
611
612*/
613
614extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
615 const Py_UNICODE *data, /* Unicode char buffer */
616 int length, /* Number of Py_UNICODE chars to encode */
617 PyObject *table, /* Translate table */
618 const char *errors /* error handling */
619 );
620
Guido van Rossumefec1152000-03-28 02:01:15 +0000621#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000622
Guido van Rossumefec1152000-03-28 02:01:15 +0000623/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000624
Guido van Rossumefec1152000-03-28 02:01:15 +0000625extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
626 const char *string, /* MBCS encoded string */
627 int length, /* size of string */
628 const char *errors /* error handling */
629 );
630
631extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
632 PyObject *unicode /* Unicode object */
633 );
634
635extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
636 const Py_UNICODE *data, /* Unicode char buffer */
637 int length, /* Number of Py_UNICODE chars to encode */
638 const char *errors /* error handling */
639 );
640
Guido van Rossumefec1152000-03-28 02:01:15 +0000641#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000642
Guido van Rossum9e896b32000-04-05 20:11:21 +0000643/* --- Decimal Encoder ---------------------------------------------------- */
644
645/* Takes a Unicode string holding a decimal value and writes it into
646 an output buffer using standard ASCII digit codes.
647
648 The output buffer has to provide at least length+1 bytes of storage
649 area. The output string is 0-terminated.
650
651 The encoder converts whitespace to ' ', decimal characters to their
652 corresponding ASCII digit and all other Latin-1 characters except
653 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
654 are treated as errors. This includes embedded NUL characters.
655
656 Error handling is defined by the errors argument:
657
658 NULL or "strict": raise a ValueError
659 "ignore": ignore the wrong characters (these are not copied to the
660 output buffer)
661 "replace": replaces illegal characters with '?'
662
663 Returns 0 on success, -1 on failure.
664
665*/
666
667extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
668 Py_UNICODE *s, /* Unicode buffer */
669 int length, /* Number of Py_UNICODE chars to encode */
670 char *output, /* Output buffer; must have size >= length */
671 const char *errors /* error handling */
672 );
673
Guido van Rossumd8225182000-03-10 22:33:05 +0000674/* --- Methods & Slots ----------------------------------------------------
675
676 These are capable of handling Unicode objects and strings on input
677 (we refer to them as strings in the descriptions) and return
678 Unicode objects or integers as appropriate. */
679
680/* Concat two strings giving a new Unicode string. */
681
682extern DL_IMPORT(PyObject*) PyUnicode_Concat(
683 PyObject *left, /* Left string */
684 PyObject *right /* Right string */
685 );
686
687/* Split a string giving a list of Unicode strings.
688
689 If sep is NULL, splitting will be done at all whitespace
690 substrings. Otherwise, splits occur at the given separator.
691
692 At most maxsplit splits will be done. If negative, no limit is set.
693
694 Separators are not included in the resulting list.
695
696*/
697
698extern DL_IMPORT(PyObject*) PyUnicode_Split(
699 PyObject *s, /* String to split */
700 PyObject *sep, /* String separator */
701 int maxsplit /* Maxsplit count */
702 );
703
704/* Ditto, but split at line breaks.
705
706 CRLF is considered to be one line break. Line breaks are not
707 included in the resulting list unless keepends is true. */
708
709extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
710 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000711 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000712 );
713
714/* Translate a string by applying a character mapping table to it and
715 return the resulting Unicode object.
716
717 The mapping table must map Unicode ordinal integers to Unicode
718 ordinal integers or None (causing deletion of the character).
719
720 Mapping tables may be dictionaries or sequences. Unmapped character
721 ordinals (ones which cause a LookupError) are left untouched and
722 are copied as-is.
723
724*/
725
726extern DL_IMPORT(PyObject *) PyUnicode_Translate(
727 PyObject *str, /* String */
728 PyObject *table, /* Translate table */
729 const char *errors /* error handling */
730 );
731
732/* Join a sequence of strings using the given separator and return
733 the resulting Unicode string. */
734
735extern DL_IMPORT(PyObject*) PyUnicode_Join(
736 PyObject *separator, /* Separator string */
737 PyObject *seq /* Sequence object */
738 );
739
740/* Return 1 if substr matches str[start:end] at the given tail end, 0
741 otherwise. */
742
743extern DL_IMPORT(int) PyUnicode_Tailmatch(
744 PyObject *str, /* String */
745 PyObject *substr, /* Prefix or Suffix string */
746 int start, /* Start index */
747 int end, /* Stop index */
748 int direction /* Tail end: -1 prefix, +1 suffix */
749 );
750
751/* Return the first position of substr in str[start:end] using the
752 given search direction or -1 if not found. */
753
754extern DL_IMPORT(int) PyUnicode_Find(
755 PyObject *str, /* String */
756 PyObject *substr, /* Substring to find */
757 int start, /* Start index */
758 int end, /* Stop index */
759 int direction /* Find direction: +1 forward, -1 backward */
760 );
761
Barry Warsaw51ac5802000-03-20 16:36:48 +0000762/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000763
764extern DL_IMPORT(int) PyUnicode_Count(
765 PyObject *str, /* String */
766 PyObject *substr, /* Substring to count */
767 int start, /* Start index */
768 int end /* Stop index */
769 );
770
Barry Warsaw51ac5802000-03-20 16:36:48 +0000771/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000772 and return the resulting Unicode object. */
773
774extern DL_IMPORT(PyObject *) PyUnicode_Replace(
775 PyObject *str, /* String */
776 PyObject *substr, /* Substring to find */
777 PyObject *replstr, /* Substring to replace */
778 int maxcount /* Max. number of replacements to apply;
779 -1 = all */
780 );
781
782/* Compare two strings and return -1, 0, 1 for less than, equal,
783 greater than resp. */
784
785extern DL_IMPORT(int) PyUnicode_Compare(
786 PyObject *left, /* Left string */
787 PyObject *right /* Right string */
788 );
789
790/* Apply an argument tuple or dictionary to a format string and return
791 the resulting Unicode string. */
792
793extern DL_IMPORT(PyObject *) PyUnicode_Format(
794 PyObject *format, /* Format string */
795 PyObject *args /* Argument tuple or dictionary */
796 );
797
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000798/* Checks whether element is contained in container and return 1/0
799 accordingly.
800
801 element has to coerce to a one-element Unicode string. -1 is
802 returned in case of an error. */
803
804extern DL_IMPORT(int) PyUnicode_Contains(
805 PyObject *container, /* Container string */
806 PyObject *element /* Element string */
807 );
808
Guido van Rossumd8225182000-03-10 22:33:05 +0000809/* === Characters Type APIs =============================================== */
810
811/* These should not be used directly. Use the Py_UNICODE_IS* and
812 Py_UNICODE_TO* macros instead.
813
814 These APIs are implemented in Objects/unicodectype.c.
815
816*/
817
818extern DL_IMPORT(int) _PyUnicode_IsLowercase(
819 register const Py_UNICODE ch /* Unicode character */
820 );
821
822extern DL_IMPORT(int) _PyUnicode_IsUppercase(
823 register const Py_UNICODE ch /* Unicode character */
824 );
825
826extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
827 register const Py_UNICODE ch /* Unicode character */
828 );
829
830extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
831 register const Py_UNICODE ch /* Unicode character */
832 );
833
834extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
835 register const Py_UNICODE ch /* Unicode character */
836 );
837
838extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
839 register const Py_UNICODE ch /* Unicode character */
840 );
841
842extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
843 register const Py_UNICODE ch /* Unicode character */
844 );
845
846extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
847 register const Py_UNICODE ch /* Unicode character */
848 );
849
850extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
851 register const Py_UNICODE ch /* Unicode character */
852 );
853
854extern DL_IMPORT(int) _PyUnicode_ToDigit(
855 register const Py_UNICODE ch /* Unicode character */
856 );
857
858extern DL_IMPORT(double) _PyUnicode_ToNumeric(
859 register const Py_UNICODE ch /* Unicode character */
860 );
861
862extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
863 register const Py_UNICODE ch /* Unicode character */
864 );
865
866extern DL_IMPORT(int) _PyUnicode_IsDigit(
867 register const Py_UNICODE ch /* Unicode character */
868 );
869
870extern DL_IMPORT(int) _PyUnicode_IsNumeric(
871 register const Py_UNICODE ch /* Unicode character */
872 );
873
874#ifdef __cplusplus
875}
876#endif
877#endif /* !Py_UNICODEOBJECT_H */