blob: 6d0435a08ece71a6a0b6147490f56b9c104b150c [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
67#ifndef HAVE_USABLE_WCHAR_T
68
69/* Windows has a usable wchar_t type */
70# if defined(MS_WIN32)
71# define HAVE_USABLE_WCHAR_T
72# endif
73
74#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77 through the interface functions PyUnicode_FromWideChar() and
78 PyUnicode_AsWideChar(). */
79
80#ifdef HAVE_USABLE_WCHAR_T
81# define HAVE_WCHAR_H
82#endif
83
84#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +000085/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87# include <time.h>
88# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000089# include "wchar.h"
90#endif
91
92#ifdef HAVE_USABLE_WCHAR_T
93
94/* If the compiler defines wchar_t as a 16-bit unsigned type we can
95 use the compiler type directly. Works fine with all modern Windows
96 platforms. */
97
98typedef wchar_t Py_UNICODE;
99
100#else
101
102/* Use if you have a standard ANSI compiler, without wchar_t support.
103 If a short is not 16 bits on your platform, you have to fix the
104 typedef below, or the module initialization code will complain. */
105
106typedef unsigned short Py_UNICODE;
107
108#endif
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114 configure Python using --with-ctype-functions. This reduces the
115 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000116
117#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
118
119#include "wctype.h"
120
121#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
122
123#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
124#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
125#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
126#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
127
128#define Py_UNICODE_TOLOWER(ch) towlower(ch)
129#define Py_UNICODE_TOUPPER(ch) towupper(ch)
130#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
131
132#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
133#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
134#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
135
136#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
137#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
138#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
139
140#else
141
142#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
143
144#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
145#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
146#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
147#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
148
149#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
150#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
151#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
152
153#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
154#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
155#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
161#endif
162
/* Copy `length` Py_UNICODE elements (a count of elements, not bytes)
   from `source` to `target` using memcpy(); the expression yields
   memcpy()'s result.  memcpy() requires that the two regions do not
   overlap. */
163#define Py_UNICODE_COPY(target, source, length)\
164 (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
165
/* Fill the first `length` elements of the Py_UNICODE buffer `target`
   with `value`.

   `target`, `value` and `length` are each evaluated exactly once and
   captured in locals with distinctive trailing-underscore names, so
   arguments with side effects, or arguments that mention a caller
   variable named `i`, behave as written.  The do/while(0) wrapper
   makes the macro usable as a single statement.  A length of zero or
   less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *fill_t_ = (target); const Py_UNICODE fill_v_ = (value);\
     const int fill_n_ = (length); int fill_i_;\
     for (fill_i_ = 0; fill_i_ < fill_n_; fill_i_++)\
         fill_t_[fill_i_] = fill_v_;}\
    while (0)
169
/* Non-zero when the Py_UNICODE data of `substring` equals the data of
   `string` starting at index `offset`.  Both arguments are accessed
   through ->str and ->length.  The memcmp() covers substring->length
   elements and no bounds check is made here, so the caller must make
   sure that many elements exist at string->str + offset.
   `substring` is evaluated twice. */
170#define Py_UNICODE_MATCH(string, offset, substring)\
171 (!memcmp((string)->str + (offset), (substring)->str,\
172 (substring)->length*sizeof(Py_UNICODE)))
173
Barry Warsaw51ac5802000-03-20 16:36:48 +0000174#ifdef __cplusplus
175extern "C" {
176#endif
177
Guido van Rossumd8225182000-03-10 22:33:05 +0000178/* --- Unicode Type ------------------------------------------------------- */
179
180typedef struct {
181 PyObject_HEAD
182 int length; /* Length of raw Unicode data in buffer */
183 Py_UNICODE *str; /* Raw Unicode buffer */
184 long hash; /* Hash value; -1 if not set */
185 PyObject *utf8str; /* UTF-8 encoded version as Python string,
186 or NULL */
187} PyUnicodeObject;
188
189extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
190
191#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
192
193/* Fast access macros */
194#define PyUnicode_GET_SIZE(op) \
195 (((PyUnicodeObject *)(op))->length)
196#define PyUnicode_GET_DATA_SIZE(op) \
197 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
198#define PyUnicode_AS_UNICODE(op) \
199 (((PyUnicodeObject *)(op))->str)
200#define PyUnicode_AS_DATA(op) \
201 ((const char *)((PyUnicodeObject *)(op))->str)
202
203/* --- Constants ---------------------------------------------------------- */
204
205/* This Unicode character will be used as replacement character during
206 decoding if the errors argument is set to "replace". Note: the
207 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
208 Unicode 3.0. */
209
210#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
211
212/* === Public API ========================================================= */
213
214/* --- Plain Py_UNICODE --------------------------------------------------- */
215
216/* Create a Unicode Object from the Py_UNICODE buffer u of the given
217 size. u may be NULL which causes the contents to be undefined. It
218 is the user's responsibility to fill in the needed data.
219
220 The buffer is copied into the new object. */
221
222extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
223 const Py_UNICODE *u, /* Unicode buffer */
224 int size /* size of buffer */
225 );
226
227/* Return a read-only pointer to the Unicode object's internal
228 Py_UNICODE buffer. */
229
230extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
231 PyObject *unicode /* Unicode object */
232 );
233
234/* Get the length of the Unicode object. */
235
236extern DL_IMPORT(int) PyUnicode_GetSize(
237 PyObject *unicode /* Unicode object */
238 );
239
Guido van Rossum52c23592000-04-10 13:41:41 +0000240/* Resize an already allocated Unicode object to the new size length.
241
242 *unicode is modified to point to the new (resized) object and 0
243 returned on success.
244
245 This API may only be called by the function which also called the
246 Unicode constructor. The refcount on the object must be 1. Otherwise,
247 an error is returned.
248
249 Error handling is implemented as follows: an exception is set, -1
250 is returned and *unicode left untouched.
251
252*/
253
254extern DL_IMPORT(int) PyUnicode_Resize(
255 PyObject **unicode, /* Pointer to the Unicode object */
256 int length /* New length */
257 );
258
Guido van Rossumd8225182000-03-10 22:33:05 +0000259/* Coerce obj to a Unicode object and return a reference with
260 *incremented* refcount.
261
262 Coercion is done in the following way:
263
264 1. Unicode objects are passed back as-is with incremented
265 refcount.
266
267 2. String and other char buffer compatible objects are decoded
268 under the assumptions that they contain UTF-8 data. Decoding
269 is done in "strict" mode.
270
271 3. All other objects raise an exception.
272
273 The API returns NULL in case of an error. The caller is responsible
274 for decref'ing the returned objects.
275
276*/
277
278extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
279 register PyObject *obj /* Object */
280 );
281
282/* --- wchar_t support for platforms which support it --------------------- */
283
284#ifdef HAVE_WCHAR_H
285
286/* Create a Unicode Object from the wchar_t buffer w of the given
287 size.
288
289 The buffer is copied into the new object. */
290
291extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
292 register const wchar_t *w, /* wchar_t buffer */
293 int size /* size of buffer */
294 );
295
296/* Copies the Unicode Object contents into the wchar_t buffer w. At
297 most size wchar_t characters are copied.
298
299 Returns the number of wchar_t characters copied or -1 in case of an
300 error. */
301
302extern DL_IMPORT(int) PyUnicode_AsWideChar(
303 PyUnicodeObject *unicode, /* Unicode object */
304 register wchar_t *w, /* wchar_t buffer */
305 int size /* size of buffer */
306 );
307
308#endif
309
310/* === Builtin Codecs =====================================================
311
312 Many of these APIs take two arguments encoding and errors. These
313 parameters encoding and errors have the same semantics as the ones
314 of the builtin unicode() API.
315
316 Setting encoding to NULL causes the default encoding to be used
317 which is UTF-8.
318
319 Error handling is set by errors which may also be set to NULL
320 meaning to use the default handling defined for the codec. Default
321 error handling for all builtin codecs is "strict" (ValueErrors are
322 raised).
323
324 The codecs all use a similar interface. Only deviation from the
325 generic ones are documented.
326
327*/
328
329/* --- Generic Codecs ----------------------------------------------------- */
330
331/* Create a Unicode object by decoding the encoded string s of the
332 given size. */
333
334extern DL_IMPORT(PyObject*) PyUnicode_Decode(
335 const char *s, /* encoded string */
336 int size, /* size of buffer */
337 const char *encoding, /* encoding */
338 const char *errors /* error handling */
339 );
340
341/* Encodes a Py_UNICODE buffer of the given size and returns a
342 Python string object. */
343
344extern DL_IMPORT(PyObject*) PyUnicode_Encode(
345 const Py_UNICODE *s, /* Unicode char buffer */
346 int size, /* number of Py_UNICODE chars to encode */
347 const char *encoding, /* encoding */
348 const char *errors /* error handling */
349 );
350
351/* Encodes a Unicode object and returns the result as Python string
352 object. */
353
354extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
355 PyObject *unicode, /* Unicode object */
356 const char *encoding, /* encoding */
357 const char *errors /* error handling */
358 );
359
360/* --- UTF-8 Codecs ------------------------------------------------------- */
361
362extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
363 const char *string, /* UTF-8 encoded string */
364 int length, /* size of string */
365 const char *errors /* error handling */
366 );
367
368extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
369 PyObject *unicode /* Unicode object */
370 );
371
372extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
373 const Py_UNICODE *data, /* Unicode char buffer */
374 int length, /* number of Py_UNICODE chars to encode */
375 const char *errors /* error handling */
376 );
377
378/* --- UTF-16 Codecs ------------------------------------------------------ */
379
Guido van Rossum9e896b32000-04-05 20:11:21 +0000380/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000381 the corresponding Unicode object.
382
383 errors (if non-NULL) defines the error handling. It defaults
384 to "strict".
385
386 If byteorder is non-NULL, the decoder starts decoding using the
387 given byte order:
388
389 *byteorder == -1: little endian
390 *byteorder == 0: native order
391 *byteorder == 1: big endian
392
393 and then switches according to all BOM marks it finds in the input
394 data. BOM marks are not copied into the resulting Unicode string.
395 After completion, *byteorder is set to the current byte order at
396 the end of input data.
397
398 If byteorder is NULL, the codec starts in native order mode.
399
400*/
401
402extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
403 const char *string, /* UTF-16 encoded string */
404 int length, /* size of string */
405 const char *errors, /* error handling */
406 int *byteorder /* pointer to byteorder to use
407 0=native;-1=LE,1=BE; updated on
408 exit */
409 );
410
411/* Returns a Python string using the UTF-16 encoding in native byte
412 order. The string always starts with a BOM mark. */
413
414extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
415 PyObject *unicode /* Unicode object */
416 );
417
418/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000419 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000420
421 If byteorder is not 0, output is written according to the following
422 byte order:
423
424 byteorder == -1: little endian
425 byteorder == 0: native byte order (writes a BOM mark)
426 byteorder == 1: big endian
427
428 If byteorder is 0, the output string will always start with the
429 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
430 prepended.
431
432 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
433 UCS-2. This trick makes it possible to add full UTF-16 capabilities
434 at a later point without compromising the APIs.
435
436*/
437
438extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
439 const Py_UNICODE *data, /* Unicode char buffer */
440 int length, /* number of Py_UNICODE chars to encode */
441 const char *errors, /* error handling */
442 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
443 );
444
445/* --- Unicode-Escape Codecs ---------------------------------------------- */
446
447extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
448 const char *string, /* Unicode-Escape encoded string */
449 int length, /* size of string */
450 const char *errors /* error handling */
451 );
452
453extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
454 PyObject *unicode /* Unicode object */
455 );
456
457extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
458 const Py_UNICODE *data, /* Unicode char buffer */
459 int length /* Number of Py_UNICODE chars to encode */
460 );
461
462/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
463
464extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
465 const char *string, /* Raw-Unicode-Escape encoded string */
466 int length, /* size of string */
467 const char *errors /* error handling */
468 );
469
470extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
471 PyObject *unicode /* Unicode object */
472 );
473
474extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
475 const Py_UNICODE *data, /* Unicode char buffer */
476 int length /* Number of Py_UNICODE chars to encode */
477 );
478
479/* --- Latin-1 Codecs -----------------------------------------------------
480
481 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
482
483*/
484
485extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
486 const char *string, /* Latin-1 encoded string */
487 int length, /* size of string */
488 const char *errors /* error handling */
489 );
490
491extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
492 PyObject *unicode /* Unicode object */
493 );
494
495extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
496 const Py_UNICODE *data, /* Unicode char buffer */
497 int length, /* Number of Py_UNICODE chars to encode */
498 const char *errors /* error handling */
499 );
500
501/* --- ASCII Codecs -------------------------------------------------------
502
503 Only 7-bit ASCII data is accepted. All other codes generate errors.
504
505*/
506
507extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
508 const char *string, /* ASCII encoded string */
509 int length, /* size of string */
510 const char *errors /* error handling */
511 );
512
513extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
514 PyObject *unicode /* Unicode object */
515 );
516
517extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
518 const Py_UNICODE *data, /* Unicode char buffer */
519 int length, /* Number of Py_UNICODE chars to encode */
520 const char *errors /* error handling */
521 );
522
523/* --- Character Map Codecs -----------------------------------------------
524
525 This codec uses mappings to encode and decode characters.
526
527 Decoding mappings must map single string characters to single
528 Unicode characters, integers (which are then interpreted as Unicode
529 ordinals) or None (meaning "undefined mapping" and causing an
530 error).
531
532 Encoding mappings must map single Unicode characters to single
533 string characters, integers (which are then interpreted as Latin-1
534 ordinals) or None (meaning "undefined mapping" and causing an
535 error).
536
537 If a character lookup fails with a LookupError, the character is
538 copied as-is meaning that its ordinal value will be interpreted as
539 Unicode or Latin-1 ordinal resp. Because of this mappings only need
540 to contain those mappings which map characters to different code
541 points.
542
543*/
544
545extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
546 const char *string, /* Encoded string */
547 int length, /* size of string */
548 PyObject *mapping, /* character mapping
549 (char ordinal -> unicode ordinal) */
550 const char *errors /* error handling */
551 );
552
553extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
554 PyObject *unicode, /* Unicode object */
555 PyObject *mapping /* character mapping
556 (unicode ordinal -> char ordinal) */
557 );
558
559extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
560 const Py_UNICODE *data, /* Unicode char buffer */
561 int length, /* Number of Py_UNICODE chars to encode */
562 PyObject *mapping, /* character mapping
563 (unicode ordinal -> char ordinal) */
564 const char *errors /* error handling */
565 );
566
567/* Translate a Py_UNICODE buffer of the given length by applying a
568 character mapping table to it and return the resulting Unicode
569 object.
570
571 The mapping table must map Unicode ordinal integers to Unicode
572 ordinal integers or None (causing deletion of the character).
573
574 Mapping tables may be dictionaries or sequences. Unmapped character
575 ordinals (ones which cause a LookupError) are left untouched and
576 are copied as-is.
577
578*/
579
580extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
581 const Py_UNICODE *data, /* Unicode char buffer */
582 int length, /* Number of Py_UNICODE chars to translate */
583 PyObject *table, /* Translate table */
584 const char *errors /* error handling */
585 );
586
Guido van Rossumefec1152000-03-28 02:01:15 +0000587#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000588
Guido van Rossumefec1152000-03-28 02:01:15 +0000589/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000590
Guido van Rossumefec1152000-03-28 02:01:15 +0000591extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
592 const char *string, /* MBCS encoded string */
593 int length, /* size of string */
594 const char *errors /* error handling */
595 );
596
597extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
598 PyObject *unicode /* Unicode object */
599 );
600
601extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
602 const Py_UNICODE *data, /* Unicode char buffer */
603 int length, /* Number of Py_UNICODE chars to encode */
604 const char *errors /* error handling */
605 );
606
Guido van Rossumefec1152000-03-28 02:01:15 +0000607#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000608
Guido van Rossum9e896b32000-04-05 20:11:21 +0000609/* --- Decimal Encoder ---------------------------------------------------- */
610
611/* Takes a Unicode string holding a decimal value and writes it into
612 an output buffer using standard ASCII digit codes.
613
614 The output buffer has to provide at least length+1 bytes of storage
615 area. The output string is 0-terminated.
616
617 The encoder converts whitespace to ' ', decimal characters to their
618 corresponding ASCII digit and all other Latin-1 characters except
619 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
620 are treated as errors. This includes embedded NUL characters.
621
622 Error handling is defined by the errors argument:
623
624 NULL or "strict": raise a ValueError
625 "ignore": ignore the wrong characters (these are not copied to the
626 output buffer)
627 "replace": replaces illegal characters with '?'
628
629 Returns 0 on success, -1 on failure.
630
631*/
632
633extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
634 Py_UNICODE *s, /* Unicode buffer */
635 int length, /* Number of Py_UNICODE chars to encode */
636 char *output, /* Output buffer; must have size >= length+1 */
637 const char *errors /* error handling */
638 );
639
Guido van Rossumd8225182000-03-10 22:33:05 +0000640/* --- Methods & Slots ----------------------------------------------------
641
642 These are capable of handling Unicode objects and strings on input
643 (we refer to them as strings in the descriptions) and return
644 Unicode objects or integers as appropriate. */
645
646/* Concat two strings giving a new Unicode string. */
647
648extern DL_IMPORT(PyObject*) PyUnicode_Concat(
649 PyObject *left, /* Left string */
650 PyObject *right /* Right string */
651 );
652
653/* Split a string giving a list of Unicode strings.
654
655 If sep is NULL, splitting will be done at all whitespace
656 substrings. Otherwise, splits occur at the given separator.
657
658 At most maxsplit splits will be done. If negative, no limit is set.
659
660 Separators are not included in the resulting list.
661
662*/
663
664extern DL_IMPORT(PyObject*) PyUnicode_Split(
665 PyObject *s, /* String to split */
666 PyObject *sep, /* String separator */
667 int maxsplit /* Maxsplit count */
668 );
669
670/* Ditto, but split at line breaks.
671
672 CRLF is considered to be one line break. Line breaks are not
673 included in the resulting list. */
674
675extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
676 PyObject *s, /* String to split */
677 int maxsplit /* Maxsplit count */
678 );
679
680/* Translate a string by applying a character mapping table to it and
681 return the resulting Unicode object.
682
683 The mapping table must map Unicode ordinal integers to Unicode
684 ordinal integers or None (causing deletion of the character).
685
686 Mapping tables may be dictionaries or sequences. Unmapped character
687 ordinals (ones which cause a LookupError) are left untouched and
688 are copied as-is.
689
690*/
691
692extern DL_IMPORT(PyObject *) PyUnicode_Translate(
693 PyObject *str, /* String */
694 PyObject *table, /* Translate table */
695 const char *errors /* error handling */
696 );
697
698/* Join a sequence of strings using the given separator and return
699 the resulting Unicode string. */
700
701extern DL_IMPORT(PyObject*) PyUnicode_Join(
702 PyObject *separator, /* Separator string */
703 PyObject *seq /* Sequence object */
704 );
705
706/* Return 1 if substr matches str[start:end] at the given tail end, 0
707 otherwise. */
708
709extern DL_IMPORT(int) PyUnicode_Tailmatch(
710 PyObject *str, /* String */
711 PyObject *substr, /* Prefix or Suffix string */
712 int start, /* Start index */
713 int end, /* Stop index */
714 int direction /* Tail end: -1 prefix, +1 suffix */
715 );
716
717/* Return the first position of substr in str[start:end] using the
718 given search direction or -1 if not found. */
719
720extern DL_IMPORT(int) PyUnicode_Find(
721 PyObject *str, /* String */
722 PyObject *substr, /* Substring to find */
723 int start, /* Start index */
724 int end, /* Stop index */
725 int direction /* Find direction: +1 forward, -1 backward */
726 );
727
Barry Warsaw51ac5802000-03-20 16:36:48 +0000728/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000729
730extern DL_IMPORT(int) PyUnicode_Count(
731 PyObject *str, /* String */
732 PyObject *substr, /* Substring to count */
733 int start, /* Start index */
734 int end /* Stop index */
735 );
736
Barry Warsaw51ac5802000-03-20 16:36:48 +0000737/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000738 and return the resulting Unicode object. */
739
740extern DL_IMPORT(PyObject *) PyUnicode_Replace(
741 PyObject *str, /* String */
742 PyObject *substr, /* Substring to find */
743 PyObject *replstr, /* Substring to replace */
744 int maxcount /* Max. number of replacements to apply;
745 -1 = all */
746 );
747
748/* Compare two strings and return -1, 0, 1 for less than, equal,
749 greater than resp. */
750
751extern DL_IMPORT(int) PyUnicode_Compare(
752 PyObject *left, /* Left string */
753 PyObject *right /* Right string */
754 );
755
756/* Apply an argument tuple or dictionary to a format string and return
757 the resulting Unicode string. */
758
759extern DL_IMPORT(PyObject *) PyUnicode_Format(
760 PyObject *format, /* Format string */
761 PyObject *args /* Argument tuple or dictionary */
762 );
763
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000764/* Checks whether element is contained in container and return 1/0
765 accordingly.
766
767 element has to coerce to a one-element Unicode string. -1 is
768 returned in case of an error. */
769
770extern DL_IMPORT(int) PyUnicode_Contains(
771 PyObject *container, /* Container string */
772 PyObject *element /* Element string */
773 );
774
Guido van Rossumd8225182000-03-10 22:33:05 +0000775/* === Characters Type APIs =============================================== */
776
777/* These should not be used directly. Use the Py_UNICODE_IS* and
778 Py_UNICODE_TO* macros instead.
779
780 These APIs are implemented in Objects/unicodectype.c.
781
782*/
783
784extern DL_IMPORT(int) _PyUnicode_IsLowercase(
785 register const Py_UNICODE ch /* Unicode character */
786 );
787
788extern DL_IMPORT(int) _PyUnicode_IsUppercase(
789 register const Py_UNICODE ch /* Unicode character */
790 );
791
792extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
793 register const Py_UNICODE ch /* Unicode character */
794 );
795
796extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
797 register const Py_UNICODE ch /* Unicode character */
798 );
799
800extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
801 register const Py_UNICODE ch /* Unicode character */
802 );
803
804extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
805 register const Py_UNICODE ch /* Unicode character */
806 );
807
808extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
809 register const Py_UNICODE ch /* Unicode character */
810 );
811
812extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
813 register const Py_UNICODE ch /* Unicode character */
814 );
815
816extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
817 register const Py_UNICODE ch /* Unicode character */
818 );
819
820extern DL_IMPORT(int) _PyUnicode_ToDigit(
821 register const Py_UNICODE ch /* Unicode character */
822 );
823
824extern DL_IMPORT(double) _PyUnicode_ToNumeric(
825 register const Py_UNICODE ch /* Unicode character */
826 );
827
828extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
829 register const Py_UNICODE ch /* Unicode character */
830 );
831
832extern DL_IMPORT(int) _PyUnicode_IsDigit(
833 register const Py_UNICODE ch /* Unicode character */
834 );
835
836extern DL_IMPORT(int) _PyUnicode_IsNumeric(
837 register const Py_UNICODE ch /* Unicode character */
838 );
839
840#ifdef __cplusplus
841}
842#endif
843#endif /* !Py_UNICODEOBJECT_H */