blob: bed3b7b8a1a176f73c3f293019baeda6bf1ffa81 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
67#ifndef HAVE_USABLE_WCHAR_T
68
69/* Windows has a usable wchar_t type */
70# if defined(MS_WIN32)
71# define HAVE_USABLE_WCHAR_T
72# endif
73
74#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77 through the interface functions PyUnicode_FromWideChar() and
78 PyUnicode_AsWideChar(). */
79
/* A usable wchar_t implies that <wchar.h> is available.  Only define
   HAVE_WCHAR_H when the build configuration has not already done so;
   redefining it with a different replacement list would be diagnosed
   by the preprocessor. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif
83
84#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +000085/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87# include <time.h>
88# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000089# include "wchar.h"
90#endif
91
92#ifdef HAVE_USABLE_WCHAR_T
93
/* If the compiler defines wchar_t as a 16-bit unsigned type we can
   use the compiler type directly. Works fine with all modern Windows
   platforms. */
97
98typedef wchar_t Py_UNICODE;
99
100#else
101
102/* Use if you have a standard ANSI compiler, without wchar_t support.
103 If a short is not 16 bits on your platform, you have to fix the
104 typedef below, or the module initialization code will complain. */
105
106typedef unsigned short Py_UNICODE;
107
108#endif
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114 configure Python using --with-ctype-functions. This reduces the
115 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000116
/* Character classification and conversion macros.

   Only the whitespace test and the lower/upper case predicates and
   mappings differ between the two configurations; every other macro
   always maps to the Python-supplied _PyUnicode_* routine and is
   therefore defined exactly once, after the conditional. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

/* Use the C library's wide character routines. */
#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

/* Use the routines supplied with Python. */
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* Identical in both configurations. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
162
/* Copy length Py_UNICODE characters from source to target.

   This is a plain memcpy(), so the two buffers must not overlap.  The
   expression evaluates to memcpy()'s return value. */
#define Py_UNICODE_COPY(target, source, length)\
    (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
165
/* Store value into the first length characters of the Py_UNICODE
   buffer target.

   target and value are evaluated exactly once and kept in locals, so
   argument expressions with side effects, or ones that mention a
   caller variable named "i", behave as written.  The locals carry a
   trailing underscore to keep them out of the caller's namespace.
   length is still evaluated on every iteration and should therefore
   be free of side effects. */
#define Py_UNICODE_FILL(target, value, length) do\
    {int fill_i_; Py_UNICODE *fill_target_ = (target);\
     Py_UNICODE fill_value_ = (value);\
     for (fill_i_ = 0; fill_i_ < (length); fill_i_++)\
         fill_target_[fill_i_] = fill_value_;}\
    while (0)
169
/* Non-zero if the characters of string starting at index offset are
   equal to the complete contents of substring.

   string and substring are pointers to objects with str and length
   members.  No bounds check is done here: the caller must make sure
   that offset + substring->length does not exceed string->length. */
#define Py_UNICODE_MATCH(string, offset, substring)\
    (!memcmp((string)->str + (offset), (substring)->str,\
             (substring)->length*sizeof(Py_UNICODE)))
173
Barry Warsaw51ac5802000-03-20 16:36:48 +0000174#ifdef __cplusplus
175extern "C" {
176#endif
177
Guido van Rossumd8225182000-03-10 22:33:05 +0000178/* --- Unicode Type ------------------------------------------------------- */
179
180typedef struct {
181 PyObject_HEAD
182 int length; /* Length of raw Unicode data in buffer */
183 Py_UNICODE *str; /* Raw Unicode buffer */
184 long hash; /* Hash value; -1 if not set */
185 PyObject *utf8str; /* UTF-8 encoded version as Python string,
186 or NULL */
187} PyUnicodeObject;
188
189extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
190
/* True if op is exactly of type PyUnicode_Type (pointer comparison on
   ob_type). */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
192
193/* Fast access macros */
/* These macros cast op to PyUnicodeObject* without any type check and
   read the structure members directly.

   GET_SIZE       number of Py_UNICODE characters in the buffer
   GET_DATA_SIZE  size of the buffer contents in bytes
   AS_UNICODE     the buffer as Py_UNICODE*
   AS_DATA        the same buffer as const char*  */
#define PyUnicode_GET_SIZE(op) \
        (((PyUnicodeObject *)(op))->length)
#define PyUnicode_GET_DATA_SIZE(op) \
        (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
#define PyUnicode_AS_UNICODE(op) \
        (((PyUnicodeObject *)(op))->str)
#define PyUnicode_AS_DATA(op) \
        ((const char *)((PyUnicodeObject *)(op))->str)
202
203/* --- Constants ---------------------------------------------------------- */
204
205/* This Unicode character will be used as replacement character during
206 decoding if the errors argument is set to "replace". Note: the
207 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
208 Unicode 3.0. */
209
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
211
212/* === Public API ========================================================= */
213
214/* --- Plain Py_UNICODE --------------------------------------------------- */
215
216/* Create a Unicode Object from the Py_UNICODE buffer u of the given
217 size. u may be NULL which causes the contents to be undefined. It
218 is the user's responsibility to fill in the needed data.
219
220 The buffer is copied into the new object. */
221
222extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
223 const Py_UNICODE *u, /* Unicode buffer */
224 int size /* size of buffer */
225 );
226
227/* Return a read-only pointer to the Unicode object's internal
228 Py_UNICODE buffer. */
229
230extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
231 PyObject *unicode /* Unicode object */
232 );
233
234/* Get the length of the Unicode object. */
235
236extern DL_IMPORT(int) PyUnicode_GetSize(
237 PyObject *unicode /* Unicode object */
238 );
239
Guido van Rossum52c23592000-04-10 13:41:41 +0000240/* Resize an already allocated Unicode object to the new size length.
241
242 *unicode is modified to point to the new (resized) object and 0
243 returned on success.
244
245 This API may only be called by the function which also called the
246 Unicode constructor. The refcount on the object must be 1. Otherwise,
247 an error is returned.
248
249 Error handling is implemented as follows: an exception is set, -1
250 is returned and *unicode left untouched.
251
252*/
253
254extern DL_IMPORT(int) PyUnicode_Resize(
255 PyObject **unicode, /* Pointer to the Unicode object */
256 int length /* New length */
257 );
258
/* Coerce obj to a Unicode object and return a reference with
   *incremented* refcount.
261
262 Coercion is done in the following way:
263
264 1. Unicode objects are passed back as-is with incremented
265 refcount.
266
267 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000268 under the assumptions that they contain data using the current
269 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000270
271 3. All other objects raise an exception.
272
273 The API returns NULL in case of an error. The caller is responsible
274 for decref'ing the returned objects.
275
276*/
277
278extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
279 register PyObject *obj /* Object */
280 );
281
282/* --- wchar_t support for platforms which support it --------------------- */
283
284#ifdef HAVE_WCHAR_H
285
/* Create a Unicode Object from the wchar_t buffer w of the given
   size.
288
289 The buffer is copied into the new object. */
290
291extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
292 register const wchar_t *w, /* wchar_t buffer */
293 int size /* size of buffer */
294 );
295
/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.
298
299 Returns the number of wchar_t characters copied or -1 in case of an
300 error. */
301
302extern DL_IMPORT(int) PyUnicode_AsWideChar(
303 PyUnicodeObject *unicode, /* Unicode object */
304 register wchar_t *w, /* wchar_t buffer */
305 int size /* size of buffer */
306 );
307
308#endif
309
310/* === Builtin Codecs =====================================================
311
312 Many of these APIs take two arguments encoding and errors. These
313 parameters encoding and errors have the same semantics as the ones
314 of the builtin unicode() API.
315
Fred Drakecb093fe2000-05-09 19:51:53 +0000316 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000317
318 Error handling is set by errors which may also be set to NULL
319 meaning to use the default handling defined for the codec. Default
320 error handling for all builtin codecs is "strict" (ValueErrors are
321 raised).
322
323 The codecs all use a similar interface. Only deviation from the
324 generic ones are documented.
325
326*/
327
Fred Drakecb093fe2000-05-09 19:51:53 +0000328/* --- Manage the default encoding ---------------------------------------- */
329
330/* Returns the currently active default encoding.
331
332 The default encoding is currently implemented as run-time settable
333 process global. This may change in future versions of the
334 interpreter to become a parameter which is managed on a per-thread
335 basis.
336
337 */
338
339extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding();
340
341/* Sets the currently active default encoding.
342
343 Returns 0 on success, -1 in case of an error.
344
345 */
346
347extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
348 const char *encoding /* Encoding name in standard form */
349 );
350
Guido van Rossumd8225182000-03-10 22:33:05 +0000351/* --- Generic Codecs ----------------------------------------------------- */
352
353/* Create a Unicode object by decoding the encoded string s of the
354 given size. */
355
356extern DL_IMPORT(PyObject*) PyUnicode_Decode(
357 const char *s, /* encoded string */
358 int size, /* size of buffer */
359 const char *encoding, /* encoding */
360 const char *errors /* error handling */
361 );
362
363/* Encodes a Py_UNICODE buffer of the given size and returns a
364 Python string object. */
365
366extern DL_IMPORT(PyObject*) PyUnicode_Encode(
367 const Py_UNICODE *s, /* Unicode char buffer */
368 int size, /* number of Py_UNICODE chars to encode */
369 const char *encoding, /* encoding */
370 const char *errors /* error handling */
371 );
372
373/* Encodes a Unicode object and returns the result as Python string
374 object. */
375
376extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
377 PyObject *unicode, /* Unicode object */
378 const char *encoding, /* encoding */
379 const char *errors /* error handling */
380 );
381
382/* --- UTF-8 Codecs ------------------------------------------------------- */
383
384extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
385 const char *string, /* UTF-8 encoded string */
386 int length, /* size of string */
387 const char *errors /* error handling */
388 );
389
390extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
391 PyObject *unicode /* Unicode object */
392 );
393
394extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
395 const Py_UNICODE *data, /* Unicode char buffer */
396 int length, /* number of Py_UNICODE chars to encode */
397 const char *errors /* error handling */
398 );
399
400/* --- UTF-16 Codecs ------------------------------------------------------ */
401
Guido van Rossum9e896b32000-04-05 20:11:21 +0000402/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000403 the corresponding Unicode object.
404
405 errors (if non-NULL) defines the error handling. It defaults
406 to "strict".
407
408 If byteorder is non-NULL, the decoder starts decoding using the
409 given byte order:
410
411 *byteorder == -1: little endian
412 *byteorder == 0: native order
413 *byteorder == 1: big endian
414
415 and then switches according to all BOM marks it finds in the input
416 data. BOM marks are not copied into the resulting Unicode string.
417 After completion, *byteorder is set to the current byte order at
418 the end of input data.
419
420 If byteorder is NULL, the codec starts in native order mode.
421
422*/
423
424extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
425 const char *string, /* UTF-16 encoded string */
426 int length, /* size of string */
427 const char *errors, /* error handling */
428 int *byteorder /* pointer to byteorder to use
429 0=native;-1=LE,1=BE; updated on
430 exit */
431 );
432
433/* Returns a Python string using the UTF-16 encoding in native byte
434 order. The string always starts with a BOM mark. */
435
436extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
437 PyObject *unicode /* Unicode object */
438 );
439
440/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000441 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000442
443 If byteorder is not 0, output is written according to the following
444 byte order:
445
446 byteorder == -1: little endian
447 byteorder == 0: native byte order (writes a BOM mark)
448 byteorder == 1: big endian
449
450 If byteorder is 0, the output string will always start with the
451 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
452 prepended.
453
   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
   UCS-2. This trick makes it possible to add full UTF-16 capabilities
   at a later point without compromising the APIs.
457
458*/
459
460extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
461 const Py_UNICODE *data, /* Unicode char buffer */
462 int length, /* number of Py_UNICODE chars to encode */
463 const char *errors, /* error handling */
464 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
465 );
466
467/* --- Unicode-Escape Codecs ---------------------------------------------- */
468
469extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
470 const char *string, /* Unicode-Escape encoded string */
471 int length, /* size of string */
472 const char *errors /* error handling */
473 );
474
475extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
476 PyObject *unicode /* Unicode object */
477 );
478
479extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
480 const Py_UNICODE *data, /* Unicode char buffer */
481 int length /* Number of Py_UNICODE chars to encode */
482 );
483
484/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
485
486extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
487 const char *string, /* Raw-Unicode-Escape encoded string */
488 int length, /* size of string */
489 const char *errors /* error handling */
490 );
491
492extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
493 PyObject *unicode /* Unicode object */
494 );
495
496extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
497 const Py_UNICODE *data, /* Unicode char buffer */
498 int length /* Number of Py_UNICODE chars to encode */
499 );
500
501/* --- Latin-1 Codecs -----------------------------------------------------
502
503 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
504
505*/
506
507extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
508 const char *string, /* Latin-1 encoded string */
509 int length, /* size of string */
510 const char *errors /* error handling */
511 );
512
513extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
514 PyObject *unicode /* Unicode object */
515 );
516
517extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
518 const Py_UNICODE *data, /* Unicode char buffer */
519 int length, /* Number of Py_UNICODE chars to encode */
520 const char *errors /* error handling */
521 );
522
523/* --- ASCII Codecs -------------------------------------------------------
524
   Only 7-bit ASCII data is accepted. All other codes generate errors.
526
527*/
528
529extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
530 const char *string, /* ASCII encoded string */
531 int length, /* size of string */
532 const char *errors /* error handling */
533 );
534
535extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
536 PyObject *unicode /* Unicode object */
537 );
538
539extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
540 const Py_UNICODE *data, /* Unicode char buffer */
541 int length, /* Number of Py_UNICODE chars to encode */
542 const char *errors /* error handling */
543 );
544
545/* --- Character Map Codecs -----------------------------------------------
546
547 This codec uses mappings to encode and decode characters.
548
549 Decoding mappings must map single string characters to single
550 Unicode characters, integers (which are then interpreted as Unicode
551 ordinals) or None (meaning "undefined mapping" and causing an
552 error).
553
554 Encoding mappings must map single Unicode characters to single
555 string characters, integers (which are then interpreted as Latin-1
556 ordinals) or None (meaning "undefined mapping" and causing an
557 error).
558
559 If a character lookup fails with a LookupError, the character is
560 copied as-is meaning that its ordinal value will be interpreted as
561 Unicode or Latin-1 ordinal resp. Because of this mappings only need
562 to contain those mappings which map characters to different code
563 points.
564
565*/
566
567extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
568 const char *string, /* Encoded string */
569 int length, /* size of string */
570 PyObject *mapping, /* character mapping
571 (char ordinal -> unicode ordinal) */
572 const char *errors /* error handling */
573 );
574
575extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
576 PyObject *unicode, /* Unicode object */
577 PyObject *mapping /* character mapping
578 (unicode ordinal -> char ordinal) */
579 );
580
581extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
582 const Py_UNICODE *data, /* Unicode char buffer */
583 int length, /* Number of Py_UNICODE chars to encode */
584 PyObject *mapping, /* character mapping
585 (unicode ordinal -> char ordinal) */
586 const char *errors /* error handling */
587 );
588
589/* Translate a Py_UNICODE buffer of the given length by applying a
590 character mapping table to it and return the resulting Unicode
591 object.
592
593 The mapping table must map Unicode ordinal integers to Unicode
594 ordinal integers or None (causing deletion of the character).
595
596 Mapping tables may be dictionaries or sequences. Unmapped character
597 ordinals (ones which cause a LookupError) are left untouched and
598 are copied as-is.
599
600*/
601
602extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
603 const Py_UNICODE *data, /* Unicode char buffer */
604 int length, /* Number of Py_UNICODE chars to encode */
605 PyObject *table, /* Translate table */
606 const char *errors /* error handling */
607 );
608
Guido van Rossumefec1152000-03-28 02:01:15 +0000609#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000610
Guido van Rossumefec1152000-03-28 02:01:15 +0000611/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000612
Guido van Rossumefec1152000-03-28 02:01:15 +0000613extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
614 const char *string, /* MBCS encoded string */
615 int length, /* size of string */
616 const char *errors /* error handling */
617 );
618
619extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
620 PyObject *unicode /* Unicode object */
621 );
622
623extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
624 const Py_UNICODE *data, /* Unicode char buffer */
625 int length, /* Number of Py_UNICODE chars to encode */
626 const char *errors /* error handling */
627 );
628
Guido van Rossumefec1152000-03-28 02:01:15 +0000629#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000630
Guido van Rossum9e896b32000-04-05 20:11:21 +0000631/* --- Decimal Encoder ---------------------------------------------------- */
632
633/* Takes a Unicode string holding a decimal value and writes it into
634 an output buffer using standard ASCII digit codes.
635
636 The output buffer has to provide at least length+1 bytes of storage
637 area. The output string is 0-terminated.
638
   The encoder converts whitespace to ' ', decimal characters to their
   corresponding ASCII digit and all other Latin-1 characters except
   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   are treated as errors. This includes embedded NUL characters.
643
644 Error handling is defined by the errors argument:
645
646 NULL or "strict": raise a ValueError
647 "ignore": ignore the wrong characters (these are not copied to the
648 output buffer)
649 "replace": replaces illegal characters with '?'
650
651 Returns 0 on success, -1 on failure.
652
653*/
654
655extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
656 Py_UNICODE *s, /* Unicode buffer */
657 int length, /* Number of Py_UNICODE chars to encode */
658 char *output, /* Output buffer; must have size >= length */
659 const char *errors /* error handling */
660 );
661
Guido van Rossumd8225182000-03-10 22:33:05 +0000662/* --- Methods & Slots ----------------------------------------------------
663
   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
667
668/* Concat two strings giving a new Unicode string. */
669
670extern DL_IMPORT(PyObject*) PyUnicode_Concat(
671 PyObject *left, /* Left string */
672 PyObject *right /* Right string */
673 );
674
675/* Split a string giving a list of Unicode strings.
676
677 If sep is NULL, splitting will be done at all whitespace
678 substrings. Otherwise, splits occur at the given separator.
679
680 At most maxsplit splits will be done. If negative, no limit is set.
681
682 Separators are not included in the resulting list.
683
684*/
685
686extern DL_IMPORT(PyObject*) PyUnicode_Split(
687 PyObject *s, /* String to split */
688 PyObject *sep, /* String separator */
689 int maxsplit /* Maxsplit count */
690 );
691
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
696
697extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
698 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000699 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000700 );
701
702/* Translate a string by applying a character mapping table to it and
703 return the resulting Unicode object.
704
705 The mapping table must map Unicode ordinal integers to Unicode
706 ordinal integers or None (causing deletion of the character).
707
708 Mapping tables may be dictionaries or sequences. Unmapped character
709 ordinals (ones which cause a LookupError) are left untouched and
710 are copied as-is.
711
712*/
713
714extern DL_IMPORT(PyObject *) PyUnicode_Translate(
715 PyObject *str, /* String */
716 PyObject *table, /* Translate table */
717 const char *errors /* error handling */
718 );
719
720/* Join a sequence of strings using the given separator and return
721 the resulting Unicode string. */
722
723extern DL_IMPORT(PyObject*) PyUnicode_Join(
724 PyObject *separator, /* Separator string */
725 PyObject *seq /* Sequence object */
726 );
727
728/* Return 1 if substr matches str[start:end] at the given tail end, 0
729 otherwise. */
730
731extern DL_IMPORT(int) PyUnicode_Tailmatch(
732 PyObject *str, /* String */
733 PyObject *substr, /* Prefix or Suffix string */
734 int start, /* Start index */
735 int end, /* Stop index */
736 int direction /* Tail end: -1 prefix, +1 suffix */
737 );
738
739/* Return the first position of substr in str[start:end] using the
740 given search direction or -1 if not found. */
741
742extern DL_IMPORT(int) PyUnicode_Find(
743 PyObject *str, /* String */
744 PyObject *substr, /* Substring to find */
745 int start, /* Start index */
746 int end, /* Stop index */
747 int direction /* Find direction: +1 forward, -1 backward */
748 );
749
Barry Warsaw51ac5802000-03-20 16:36:48 +0000750/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000751
752extern DL_IMPORT(int) PyUnicode_Count(
753 PyObject *str, /* String */
754 PyObject *substr, /* Substring to count */
755 int start, /* Start index */
756 int end /* Stop index */
757 );
758
Barry Warsaw51ac5802000-03-20 16:36:48 +0000759/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000760 and return the resulting Unicode object. */
761
762extern DL_IMPORT(PyObject *) PyUnicode_Replace(
763 PyObject *str, /* String */
764 PyObject *substr, /* Substring to find */
765 PyObject *replstr, /* Substring to replace */
766 int maxcount /* Max. number of replacements to apply;
767 -1 = all */
768 );
769
770/* Compare two strings and return -1, 0, 1 for less than, equal,
771 greater than resp. */
772
773extern DL_IMPORT(int) PyUnicode_Compare(
774 PyObject *left, /* Left string */
775 PyObject *right /* Right string */
776 );
777
/* Apply an argument tuple or dictionary to a format string and return
   the resulting Unicode string. */
780
781extern DL_IMPORT(PyObject *) PyUnicode_Format(
782 PyObject *format, /* Format string */
783 PyObject *args /* Argument tuple or dictionary */
784 );
785
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000786/* Checks whether element is contained in container and return 1/0
787 accordingly.
788
   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
791
792extern DL_IMPORT(int) PyUnicode_Contains(
793 PyObject *container, /* Container string */
794 PyObject *element /* Element string */
795 );
796
Guido van Rossumd8225182000-03-10 22:33:05 +0000797/* === Characters Type APIs =============================================== */
798
799/* These should not be used directly. Use the Py_UNICODE_IS* and
800 Py_UNICODE_TO* macros instead.
801
802 These APIs are implemented in Objects/unicodectype.c.
803
804*/
805
806extern DL_IMPORT(int) _PyUnicode_IsLowercase(
807 register const Py_UNICODE ch /* Unicode character */
808 );
809
810extern DL_IMPORT(int) _PyUnicode_IsUppercase(
811 register const Py_UNICODE ch /* Unicode character */
812 );
813
814extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
815 register const Py_UNICODE ch /* Unicode character */
816 );
817
818extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
819 register const Py_UNICODE ch /* Unicode character */
820 );
821
822extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
823 register const Py_UNICODE ch /* Unicode character */
824 );
825
826extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
827 register const Py_UNICODE ch /* Unicode character */
828 );
829
830extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
831 register const Py_UNICODE ch /* Unicode character */
832 );
833
834extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
835 register const Py_UNICODE ch /* Unicode character */
836 );
837
838extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
839 register const Py_UNICODE ch /* Unicode character */
840 );
841
842extern DL_IMPORT(int) _PyUnicode_ToDigit(
843 register const Py_UNICODE ch /* Unicode character */
844 );
845
846extern DL_IMPORT(double) _PyUnicode_ToNumeric(
847 register const Py_UNICODE ch /* Unicode character */
848 );
849
850extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
851 register const Py_UNICODE ch /* Unicode character */
852 );
853
854extern DL_IMPORT(int) _PyUnicode_IsDigit(
855 register const Py_UNICODE ch /* Unicode character */
856 );
857
858extern DL_IMPORT(int) _PyUnicode_IsNumeric(
859 register const Py_UNICODE ch /* Unicode character */
860 );
861
862#ifdef __cplusplus
863}
864#endif
865#endif /* !Py_UNICODEOBJECT_H */