blob: 967334ae2dce0968303a8e8129b00156cea07815 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
67#ifndef HAVE_USABLE_WCHAR_T
68
69/* Windows has a usable wchar_t type */
70# if defined(MS_WIN32)
71# define HAVE_USABLE_WCHAR_T
72# endif
73
74#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77 through the interface functions PyUnicode_FromWideChar() and
78 PyUnicode_AsWideChar(). */
79
80#ifdef HAVE_USABLE_WCHAR_T
81# define HAVE_WCHAR_H
82#endif
83
84#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +000085/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87# include <time.h>
88# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000089# include "wchar.h"
90#endif
91
92#ifdef HAVE_USABLE_WCHAR_T
93
94/* If the compiler defines wchar_t as a 16-bit unsigned type we can
95 use the compiler type directly. Works fine with all modern Windows
96 platforms. */
97
98typedef wchar_t Py_UNICODE;
99
100#else
101
102/* Use if you have a standard ANSI compiler, without wchar_t support.
103 If a short is not 16 bits on your platform, you have to fix the
104 typedef below, or the module initialization code will complain. */
105
106typedef unsigned short Py_UNICODE;
107
108#endif
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114 configure Python using --with-ctype-functions. This reduces the
115 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000116
117#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
118
119#include "wctype.h"
120
/* wctype-backed variant of the character macros.  Whitespace, lower-case
   and upper-case tests and lower/upper conversion are forwarded to the C
   library's wctype.h functions; title-case, line-break and all
   decimal/digit/numeric handling still go through the _PyUnicode_*
   helpers declared further down in this header. */
121#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
122
123#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
124#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
125#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
126#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
127
128#define Py_UNICODE_TOLOWER(ch) towlower(ch)
129#define Py_UNICODE_TOUPPER(ch) towupper(ch)
130#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
131
132#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
133#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
134#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
135
136#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
137#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
138#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
139
140#else
141
/* Default variant of the character macros: every test and conversion is
   forwarded to the matching _PyUnicode_* helper declared further down in
   this header (section "Characters Type APIs"). */
142#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
143
144#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
145#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
146#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
147#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
148
149#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
150#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
151#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
152
153#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
154#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
155#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
161#endif
162
/* Copy `length` Py_UNICODE units (not bytes) from `source` to `target`.
   This is a plain memcpy, so the two regions must not overlap; the
   expression yields memcpy's return value.
   NOTE(review): this header does not include string.h itself, so the
   including file presumably has to make memcpy visible -- confirm. */
163#define Py_UNICODE_COPY(target, source, length)\
164 (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
165
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   `target` and `value` are evaluated exactly once and kept in block-local
   temporaries, so arguments with side effects (p++, function calls) and
   arguments that mention a caller variable named `i` behave the same way
   they would in a function call.  The temporaries use deliberately
   unusual names so they cannot capture identifiers from the call site.

   `length` is still re-evaluated on every loop test, so pass a
   side-effect-free expression for it.  Nothing is written when `length`
   is zero or negative.  Usable as a single statement (do/while(0)). */
#define Py_UNICODE_FILL(target, value, length) do\
    {int py_fill_i_; Py_UNICODE *py_fill_t_ = (target);\
     Py_UNICODE py_fill_v_ = (value);\
     for (py_fill_i_ = 0; py_fill_i_ < (length); py_fill_i_++)\
         py_fill_t_[py_fill_i_] = py_fill_v_;}\
    while (0)
169
/* Nonzero when the contents of `substring` appear in `string` starting at
   index `offset`.  `string` and `substring` are used via ->str and
   ->length, i.e. they are pointers to objects with those members.

   The first code unit is compared on its own as a cheap reject before
   memcmp runs over substring->length units.  Consequences visible here:
   - every argument is expanded more than once, so pass side-effect-free
     expressions;
   - no bounds are checked: the caller has to guarantee that
     offset + substring->length stays inside string's buffer;
   - for a zero-length substring the result is decided by the first-unit
     comparison alone (memcmp over 0 bytes always reports equality). */
170#define Py_UNICODE_MATCH(string, offset, substring)\
Marc-André Lemburg2f4d0e92000-06-18 22:22:27 +0000171 ((*((string)->str + (offset)) == *((substring)->str)) &&\
172 !memcmp((string)->str + (offset), (substring)->str,\
Guido van Rossumd8225182000-03-10 22:33:05 +0000173 (substring)->length*sizeof(Py_UNICODE)))
174
Barry Warsaw51ac5802000-03-20 16:36:48 +0000175#ifdef __cplusplus
176extern "C" {
177#endif
178
Guido van Rossumd8225182000-03-10 22:33:05 +0000179/* --- Unicode Type ------------------------------------------------------- */
180
/* Object layout of a Unicode string.  `length` counts Py_UNICODE units,
   not bytes (PyUnicode_GET_DATA_SIZE multiplies it by sizeof(Py_UNICODE)
   to obtain the byte size). */
181typedef struct {
182 PyObject_HEAD
183 int length; /* Length of raw Unicode data in buffer */
184 Py_UNICODE *str; /* Raw Unicode buffer */
185 long hash; /* Hash value; -1 if not set */
186 PyObject *utf8str; /* UTF-8 encoded version as Python string,
187 or NULL */
188} PyUnicodeObject;
189
190extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
191
/* True when op's ob_type is exactly &PyUnicode_Type (a pointer
   comparison).  op is dereferenced, so it must not be NULL. */
191#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
193
194/* Fast access macros.  Each one casts op to PyUnicodeObject * without
   any type or NULL check, so the caller has to know op is a Unicode
   object already. */
/* Length in Py_UNICODE units (the `length` member). */
195#define PyUnicode_GET_SIZE(op) \
196 (((PyUnicodeObject *)(op))->length)
/* Size of the buffer contents in bytes: length * sizeof(Py_UNICODE). */
197#define PyUnicode_GET_DATA_SIZE(op) \
198 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
/* The object's internal Py_UNICODE buffer (the `str` member). */
199#define PyUnicode_AS_UNICODE(op) \
200 (((PyUnicodeObject *)(op))->str)
/* The same buffer viewed as read-only raw bytes. */
201#define PyUnicode_AS_DATA(op) \
202 ((const char *)((PyUnicodeObject *)(op))->str)
203
204/* --- Constants ---------------------------------------------------------- */
205
206/* This Unicode character will be used as replacement character during
207 decoding if the errors argument is set to "replace". Note: the
208 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
209 Unicode 3.0. */
210
211#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
212
213/* === Public API ========================================================= */
214
215/* --- Plain Py_UNICODE --------------------------------------------------- */
216
217/* Create a Unicode Object from the Py_UNICODE buffer u of the given
218 size. u may be NULL which causes the contents to be undefined. It
219 is the user's responsibility to fill in the needed data.
220
221 The buffer is copied into the new object. */
222
223extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
224 const Py_UNICODE *u, /* Unicode buffer */
225 int size /* size of buffer */
226 );
227
228/* Return a read-only pointer to the Unicode object's internal
229 Py_UNICODE buffer. */
230
231extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
232 PyObject *unicode /* Unicode object */
233 );
234
235/* Get the length of the Unicode object. */
236
237extern DL_IMPORT(int) PyUnicode_GetSize(
238 PyObject *unicode /* Unicode object */
239 );
240
Guido van Rossum52c23592000-04-10 13:41:41 +0000241/* Resize an already allocated Unicode object to the new size length.
242
243 *unicode is modified to point to the new (resized) object and 0
244 returned on success.
245
246 This API may only be called by the function which also called the
247 Unicode constructor. The refcount on the object must be 1. Otherwise,
248 an error is returned.
249
250 Error handling is implemented as follows: an exception is set, -1
251 is returned and *unicode left untouched.
252
253*/
254
255extern DL_IMPORT(int) PyUnicode_Resize(
256 PyObject **unicode, /* Pointer to the Unicode object */
257 int length /* New length */
258 );
259
Guido van Rossumd8225182000-03-10 22:33:05 +0000260/* Coerce obj to an Unicode object and return a reference with
261 *incremented* refcount.
262
263 Coercion is done in the following way:
264
265 1. Unicode objects are passed back as-is with incremented
266 refcount.
267
268 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000269 under the assumptions that they contain data using the current
270 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000271
272 3. All other objects raise an exception.
273
274 The API returns NULL in case of an error. The caller is responsible
275 for decref'ing the returned objects.
276
277*/
278
279extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
280 register PyObject *obj /* Object */
281 );
282
283/* --- wchar_t support for platforms which support it --------------------- */
284
285#ifdef HAVE_WCHAR_H
286
287/* Create a Unicode Object from the wchar_t buffer w of the given
288 size.
289
290 The buffer is copied into the new object. */
291
292extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
293 register const wchar_t *w, /* wchar_t buffer */
294 int size /* size of buffer */
295 );
296
297/* Copies the Unicode Object contents into the wchar_t buffer w. At
298 most size wchar_t characters are copied.
299
300 Returns the number of wchar_t characters copied or -1 in case of an
301 error. */
302
303extern DL_IMPORT(int) PyUnicode_AsWideChar(
304 PyUnicodeObject *unicode, /* Unicode object */
305 register wchar_t *w, /* wchar_t buffer */
306 int size /* size of buffer */
307 );
308
309#endif
310
311/* === Builtin Codecs =====================================================
312
313 Many of these APIs take two arguments encoding and errors. These
314 parameters encoding and errors have the same semantics as the ones
315 of the builtin unicode() API.
316
Fred Drakecb093fe2000-05-09 19:51:53 +0000317 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000318
319 Error handling is set by errors which may also be set to NULL
320 meaning to use the default handling defined for the codec. Default
321 error handling for all builtin codecs is "strict" (ValueErrors are
322 raised).
323
324 The codecs all use a similar interface. Only deviations from the
325 generic ones are documented.
326
327*/
328
Fred Drakecb093fe2000-05-09 19:51:53 +0000329/* --- Manage the default encoding ---------------------------------------- */
330
331/* Returns the currently active default encoding.
332
333 The default encoding is currently implemented as run-time settable
334 process global. This may change in future versions of the
335 interpreter to become a parameter which is managed on a per-thread
336 basis.
337
338 */
339
340extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding();
341
342/* Sets the currently active default encoding.
343
344 Returns 0 on success, -1 in case of an error.
345
346 */
347
348extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
349 const char *encoding /* Encoding name in standard form */
350 );
351
Guido van Rossumd8225182000-03-10 22:33:05 +0000352/* --- Generic Codecs ----------------------------------------------------- */
353
354/* Create a Unicode object by decoding the encoded string s of the
355 given size. */
356
357extern DL_IMPORT(PyObject*) PyUnicode_Decode(
358 const char *s, /* encoded string */
359 int size, /* size of buffer */
360 const char *encoding, /* encoding */
361 const char *errors /* error handling */
362 );
363
364/* Encodes a Py_UNICODE buffer of the given size and returns a
365 Python string object. */
366
367extern DL_IMPORT(PyObject*) PyUnicode_Encode(
368 const Py_UNICODE *s, /* Unicode char buffer */
369 int size, /* number of Py_UNICODE chars to encode */
370 const char *encoding, /* encoding */
371 const char *errors /* error handling */
372 );
373
374/* Encodes a Unicode object and returns the result as Python string
375 object. */
376
377extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
378 PyObject *unicode, /* Unicode object */
379 const char *encoding, /* encoding */
380 const char *errors /* error handling */
381 );
382
383/* --- UTF-8 Codecs ------------------------------------------------------- */
384
385extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
386 const char *string, /* UTF-8 encoded string */
387 int length, /* size of string */
388 const char *errors /* error handling */
389 );
390
391extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
392 PyObject *unicode /* Unicode object */
393 );
394
395extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
396 const Py_UNICODE *data, /* Unicode char buffer */
397 int length, /* number of Py_UNICODE chars to encode */
398 const char *errors /* error handling */
399 );
400
401/* --- UTF-16 Codecs ------------------------------------------------------ */
402
Guido van Rossum9e896b32000-04-05 20:11:21 +0000403/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000404 the corresponding Unicode object.
405
406 errors (if non-NULL) defines the error handling. It defaults
407 to "strict".
408
409 If byteorder is non-NULL, the decoder starts decoding using the
410 given byte order:
411
412 *byteorder == -1: little endian
413 *byteorder == 0: native order
414 *byteorder == 1: big endian
415
416 and then switches according to all BOM marks it finds in the input
417 data. BOM marks are not copied into the resulting Unicode string.
418 After completion, *byteorder is set to the current byte order at
419 the end of input data.
420
421 If byteorder is NULL, the codec starts in native order mode.
422
423*/
424
425extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
426 const char *string, /* UTF-16 encoded string */
427 int length, /* size of string */
428 const char *errors, /* error handling */
429 int *byteorder /* pointer to byteorder to use
430 0=native;-1=LE,1=BE; updated on
431 exit */
432 );
433
434/* Returns a Python string using the UTF-16 encoding in native byte
435 order. The string always starts with a BOM mark. */
436
437extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
438 PyObject *unicode /* Unicode object */
439 );
440
441/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000442 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000443
444 If byteorder is not 0, output is written according to the following
445 byte order:
446
447 byteorder == -1: little endian
448 byteorder == 0: native byte order (writes a BOM mark)
449 byteorder == 1: big endian
450
451 If byteorder is 0, the output string will always start with the
452 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
453 prepended.
454
455 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
456 UCS-2. This trick makes it possible to add full UTF-16 capabilities
457 at a later point without compromising the APIs.
458
459*/
460
461extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
462 const Py_UNICODE *data, /* Unicode char buffer */
463 int length, /* number of Py_UNICODE chars to encode */
464 const char *errors, /* error handling */
465 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
466 );
467
468/* --- Unicode-Escape Codecs ---------------------------------------------- */
469
470extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
471 const char *string, /* Unicode-Escape encoded string */
472 int length, /* size of string */
473 const char *errors /* error handling */
474 );
475
476extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
477 PyObject *unicode /* Unicode object */
478 );
479
480extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
481 const Py_UNICODE *data, /* Unicode char buffer */
482 int length /* Number of Py_UNICODE chars to encode */
483 );
484
485/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
486
487extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
488 const char *string, /* Raw-Unicode-Escape encoded string */
489 int length, /* size of string */
490 const char *errors /* error handling */
491 );
492
493extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
494 PyObject *unicode /* Unicode object */
495 );
496
497extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
498 const Py_UNICODE *data, /* Unicode char buffer */
499 int length /* Number of Py_UNICODE chars to encode */
500 );
501
502/* --- Latin-1 Codecs -----------------------------------------------------
503
504 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
505
506*/
507
508extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
509 const char *string, /* Latin-1 encoded string */
510 int length, /* size of string */
511 const char *errors /* error handling */
512 );
513
514extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
515 PyObject *unicode /* Unicode object */
516 );
517
518extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
519 const Py_UNICODE *data, /* Unicode char buffer */
520 int length, /* Number of Py_UNICODE chars to encode */
521 const char *errors /* error handling */
522 );
523
524/* --- ASCII Codecs -------------------------------------------------------
525
526 Only 7-bit ASCII data is accepted. All other codes generate errors.
527
528*/
529
530extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
531 const char *string, /* ASCII encoded string */
532 int length, /* size of string */
533 const char *errors /* error handling */
534 );
535
536extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
537 PyObject *unicode /* Unicode object */
538 );
539
540extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
541 const Py_UNICODE *data, /* Unicode char buffer */
542 int length, /* Number of Py_UNICODE chars to encode */
543 const char *errors /* error handling */
544 );
545
546/* --- Character Map Codecs -----------------------------------------------
547
548 This codec uses mappings to encode and decode characters.
549
550 Decoding mappings must map single string characters to single
551 Unicode characters, integers (which are then interpreted as Unicode
552 ordinals) or None (meaning "undefined mapping" and causing an
553 error).
554
555 Encoding mappings must map single Unicode characters to single
556 string characters, integers (which are then interpreted as Latin-1
557 ordinals) or None (meaning "undefined mapping" and causing an
558 error).
559
560 If a character lookup fails with a LookupError, the character is
561 copied as-is meaning that its ordinal value will be interpreted as
562 Unicode or Latin-1 ordinal resp. Because of this mappings only need
563 to contain those mappings which map characters to different code
564 points.
565
566*/
567
568extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
569 const char *string, /* Encoded string */
570 int length, /* size of string */
571 PyObject *mapping, /* character mapping
572 (char ordinal -> unicode ordinal) */
573 const char *errors /* error handling */
574 );
575
576extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
577 PyObject *unicode, /* Unicode object */
578 PyObject *mapping /* character mapping
579 (unicode ordinal -> char ordinal) */
580 );
581
582extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
583 const Py_UNICODE *data, /* Unicode char buffer */
584 int length, /* Number of Py_UNICODE chars to encode */
585 PyObject *mapping, /* character mapping
586 (unicode ordinal -> char ordinal) */
587 const char *errors /* error handling */
588 );
589
590/* Translate a Py_UNICODE buffer of the given length by applying a
591 character mapping table to it and return the resulting Unicode
592 object.
593
594 The mapping table must map Unicode ordinal integers to Unicode
595 ordinal integers or None (causing deletion of the character).
596
597 Mapping tables may be dictionaries or sequences. Unmapped character
598 ordinals (ones which cause a LookupError) are left untouched and
599 are copied as-is.
600
601*/
602
603extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
604 const Py_UNICODE *data, /* Unicode char buffer */
605 int length, /* Number of Py_UNICODE chars to encode */
606 PyObject *table, /* Translate table */
607 const char *errors /* error handling */
608 );
609
Guido van Rossumefec1152000-03-28 02:01:15 +0000610#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000611
Guido van Rossumefec1152000-03-28 02:01:15 +0000612/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000613
Guido van Rossumefec1152000-03-28 02:01:15 +0000614extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
615 const char *string, /* MBCS encoded string */
616 int length, /* size of string */
617 const char *errors /* error handling */
618 );
619
620extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
621 PyObject *unicode /* Unicode object */
622 );
623
624extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
625 const Py_UNICODE *data, /* Unicode char buffer */
626 int length, /* Number of Py_UNICODE chars to encode */
627 const char *errors /* error handling */
628 );
629
Guido van Rossumefec1152000-03-28 02:01:15 +0000630#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000631
Guido van Rossum9e896b32000-04-05 20:11:21 +0000632/* --- Decimal Encoder ---------------------------------------------------- */
633
634/* Takes a Unicode string holding a decimal value and writes it into
635 an output buffer using standard ASCII digit codes.
636
637 The output buffer has to provide at least length+1 bytes of storage
638 area. The output string is 0-terminated.
639
640 The encoder converts whitespace to ' ', decimal characters to their
641 corresponding ASCII digit and all other Latin-1 characters except
642 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
643 are treated as errors. This includes embedded NUL characters.
644
645 Error handling is defined by the errors argument:
646
647 NULL or "strict": raise a ValueError
648 "ignore": ignore the wrong characters (these are not copied to the
649 output buffer)
650 "replace": replaces illegal characters with '?'
651
652 Returns 0 on success, -1 on failure.
653
654*/
655
656extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
657 Py_UNICODE *s, /* Unicode buffer */
658 int length, /* Number of Py_UNICODE chars to encode */
659 char *output, /* Output buffer; must have size >= length+1
                      (room for the terminating 0 byte) */
660 const char *errors /* error handling */
661 );
662
Guido van Rossumd8225182000-03-10 22:33:05 +0000663/* --- Methods & Slots ----------------------------------------------------
664
665 These are capable of handling Unicode objects and strings on input
666 (we refer to them as strings in the descriptions) and return
667 Unicode objects or integers as appropriate. */
668
669/* Concat two strings giving a new Unicode string. */
670
671extern DL_IMPORT(PyObject*) PyUnicode_Concat(
672 PyObject *left, /* Left string */
673 PyObject *right /* Right string */
674 );
675
676/* Split a string giving a list of Unicode strings.
677
678 If sep is NULL, splitting will be done at all whitespace
679 substrings. Otherwise, splits occur at the given separator.
680
681 At most maxsplit splits will be done. If negative, no limit is set.
682
683 Separators are not included in the resulting list.
684
685*/
686
687extern DL_IMPORT(PyObject*) PyUnicode_Split(
688 PyObject *s, /* String to split */
689 PyObject *sep, /* String separator */
690 int maxsplit /* Maxsplit count */
691 );
692
693/* Ditto, but split at line breaks.
694
695 CRLF is considered to be one line break. Line breaks are not
696 included in the resulting list. */
697
698extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
699 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000700 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000701 );
702
703/* Translate a string by applying a character mapping table to it and
704 return the resulting Unicode object.
705
706 The mapping table must map Unicode ordinal integers to Unicode
707 ordinal integers or None (causing deletion of the character).
708
709 Mapping tables may be dictionaries or sequences. Unmapped character
710 ordinals (ones which cause a LookupError) are left untouched and
711 are copied as-is.
712
713*/
714
715extern DL_IMPORT(PyObject *) PyUnicode_Translate(
716 PyObject *str, /* String */
717 PyObject *table, /* Translate table */
718 const char *errors /* error handling */
719 );
720
721/* Join a sequence of strings using the given separator and return
722 the resulting Unicode string. */
723
724extern DL_IMPORT(PyObject*) PyUnicode_Join(
725 PyObject *separator, /* Separator string */
726 PyObject *seq /* Sequence object */
727 );
728
729/* Return 1 if substr matches str[start:end] at the given tail end, 0
730 otherwise. */
731
732extern DL_IMPORT(int) PyUnicode_Tailmatch(
733 PyObject *str, /* String */
734 PyObject *substr, /* Prefix or Suffix string */
735 int start, /* Start index */
736 int end, /* Stop index */
737 int direction /* Tail end: -1 prefix, +1 suffix */
738 );
739
740/* Return the first position of substr in str[start:end] using the
741 given search direction or -1 if not found. */
742
743extern DL_IMPORT(int) PyUnicode_Find(
744 PyObject *str, /* String */
745 PyObject *substr, /* Substring to find */
746 int start, /* Start index */
747 int end, /* Stop index */
748 int direction /* Find direction: +1 forward, -1 backward */
749 );
750
Barry Warsaw51ac5802000-03-20 16:36:48 +0000751/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000752
753extern DL_IMPORT(int) PyUnicode_Count(
754 PyObject *str, /* String */
755 PyObject *substr, /* Substring to count */
756 int start, /* Start index */
757 int end /* Stop index */
758 );
759
Barry Warsaw51ac5802000-03-20 16:36:48 +0000760/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000761 and return the resulting Unicode object. */
762
763extern DL_IMPORT(PyObject *) PyUnicode_Replace(
764 PyObject *str, /* String */
765 PyObject *substr, /* Substring to find */
766 PyObject *replstr, /* Substring to replace */
767 int maxcount /* Max. number of replacements to apply;
768 -1 = all */
769 );
770
771/* Compare two strings and return -1, 0, 1 for less than, equal,
772 greater than resp. */
773
774extern DL_IMPORT(int) PyUnicode_Compare(
775 PyObject *left, /* Left string */
776 PyObject *right /* Right string */
777 );
778
779/* Apply an argument tuple or dictionary to a format string and return
780 the resulting Unicode string. */
781
782extern DL_IMPORT(PyObject *) PyUnicode_Format(
783 PyObject *format, /* Format string */
784 PyObject *args /* Argument tuple or dictionary */
785 );
786
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000787/* Checks whether element is contained in container and return 1/0
788 accordingly.
789
790 element has to coerce to a one-element Unicode string. -1 is
791 returned in case of an error. */
792
793extern DL_IMPORT(int) PyUnicode_Contains(
794 PyObject *container, /* Container string */
795 PyObject *element /* Element string */
796 );
797
Guido van Rossumd8225182000-03-10 22:33:05 +0000798/* === Characters Type APIs =============================================== */
799
800/* These should not be used directly. Use the Py_UNICODE_IS* and
801 Py_UNICODE_TO* macros instead.
802
803 These APIs are implemented in Objects/unicodectype.c.
804
805*/
806
807extern DL_IMPORT(int) _PyUnicode_IsLowercase(
808 register const Py_UNICODE ch /* Unicode character */
809 );
810
811extern DL_IMPORT(int) _PyUnicode_IsUppercase(
812 register const Py_UNICODE ch /* Unicode character */
813 );
814
815extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
816 register const Py_UNICODE ch /* Unicode character */
817 );
818
819extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
820 register const Py_UNICODE ch /* Unicode character */
821 );
822
823extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
824 register const Py_UNICODE ch /* Unicode character */
825 );
826
827extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
828 register const Py_UNICODE ch /* Unicode character */
829 );
830
831extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
832 register const Py_UNICODE ch /* Unicode character */
833 );
834
835extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
836 register const Py_UNICODE ch /* Unicode character */
837 );
838
839extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
840 register const Py_UNICODE ch /* Unicode character */
841 );
842
843extern DL_IMPORT(int) _PyUnicode_ToDigit(
844 register const Py_UNICODE ch /* Unicode character */
845 );
846
847extern DL_IMPORT(double) _PyUnicode_ToNumeric(
848 register const Py_UNICODE ch /* Unicode character */
849 );
850
851extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
852 register const Py_UNICODE ch /* Unicode character */
853 );
854
855extern DL_IMPORT(int) _PyUnicode_IsDigit(
856 register const Py_UNICODE ch /* Unicode character */
857 );
858
859extern DL_IMPORT(int) _PyUnicode_IsNumeric(
860 register const Py_UNICODE ch /* Unicode character */
861 );
862
863#ifdef __cplusplus
864}
865#endif
866#endif /* !Py_UNICODEOBJECT_H */