#ifndef Py_UNICODEOBJECT_H
#define Py_UNICODEOBJECT_H

/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.


 Original header:
 --------------------------------------------------------------------

 * Yet another Unicode string type for Python. This type supports the
 * 16-bit Basic Multilingual Plane (BMP) only.
 *
 * Written by Fredrik Lundh, January 1999.
 *
 * Copyright (c) 1999 by Secret Labs AB.
 * Copyright (c) 1999 by Fredrik Lundh.
 *
 * fredrik@pythonware.com
 * http://www.pythonware.com
 *
 * --------------------------------------------------------------------
 * This Unicode String Type is
 *
 * Copyright (c) 1999 by Secret Labs AB
 * Copyright (c) 1999 by Fredrik Lundh
 *
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
 *
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
 *
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * -------------------------------------------------------------------- */

#include "ctype.h"
#include <string.h>

/* === Internal API ======================================================= */

/* --- Internal Unicode Format -------------------------------------------- */

/* Set these flags if the platform has "wchar.h", "wctype.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* Defaults for various platforms */
#ifndef HAVE_USABLE_WCHAR_T

/* Windows has a usable wchar_t type */
# if defined(MS_WIN32)
#  define HAVE_USABLE_WCHAR_T
# endif

#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). */

/* A usable wchar_t implies that wchar.h is available.  The flag is only
   defined when it is not set yet, so that an earlier definition carrying
   a value does not clash with this one. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include "wchar.h"
#endif

#ifdef HAVE_USABLE_WCHAR_T

/* If the compiler defines wchar_t as a 16-bit unsigned type we can
   use the compiler type directly. Works fine with all modern Windows
   platforms. */

typedef wchar_t Py_UNICODE;

#else

/* Use if you have a standard ANSI compiler, without wchar_t support.
   If a short is not 16 bits on your platform, you have to fix the
   typedef below, or the module initialization code will complain. */

typedef unsigned short Py_UNICODE;

#endif

/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as single unsigned integer.
 *
 * unsigned int is used when the SIZEOF_INT macro reports at least four
 * bytes.  In every other case (including SIZEOF_INT not being defined
 * at all) unsigned long is used, which the C standard guarantees to
 * hold at least 32 bits, so Py_UCS4 is always defined.
 */
#if defined(SIZEOF_INT) && SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif


/* --- Internal Unicode Operations ---------------------------------------- */

/* If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-ctype-functions. This reduces the
   interpreter's code size. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

/* Character classification and case mapping via the C library. */
#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#else

/* Character classification and case mapping via Python's own
   _PyUnicode_* functions. */
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#endif

/* The remaining operations map to the _PyUnicode_* functions in both
   configurations. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric.

   ch may be evaluated up to four times, so pass an expression without
   side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))

/* Copy length Py_UNICODE characters from source to target using
   memcpy(); as with memcpy(), the buffers must not overlap. */
#define Py_UNICODE_COPY(target, source, length)\
    (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))

/* Store value into the first length characters of target.

   target and value are each evaluated exactly once, and the helper
   variables carry a trailing underscore so that they do not capture a
   caller variable such as "i" used in one of the arguments.  length is
   re-evaluated on every iteration. */
#define Py_UNICODE_FILL(target, value, length) do\
    {int i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;}\
    while (0)

/* True if the characters of string starting at offset are equal to
   substring.  The first character is compared directly before memcmp()
   is called on the whole span.  No bounds checking is done here: the
   caller has to make sure that offset + substring->length stays within
   string. */
#define Py_UNICODE_MATCH(string, offset, substring)\
    ((*((string)->str + (offset)) == *((substring)->str)) &&\
     !memcmp((string)->str + (offset), (substring)->str,\
             (substring)->length*sizeof(Py_UNICODE)))

#ifdef __cplusplus
extern "C" {
#endif

Guido van Rossumd8225182000-03-10 22:33:05 +0000200/* --- Unicode Type ------------------------------------------------------- */
201
202typedef struct {
203 PyObject_HEAD
204 int length; /* Length of raw Unicode data in buffer */
205 Py_UNICODE *str; /* Raw Unicode buffer */
206 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000207 PyObject *defenc; /* (Default) Encoded version as Python
208 string, or NULL; this is used for
209 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000210} PyUnicodeObject;
211
212extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
213
214#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
215
/* Fast access macros.  op is cast to PyUnicodeObject* without any type
   check.

   GET_SIZE      -> number of Py_UNICODE characters (the length field)
   GET_DATA_SIZE -> the same length expressed in bytes
   AS_UNICODE    -> the raw Py_UNICODE buffer (the str field)
   AS_DATA       -> the same buffer viewed as const char*  */
#define PyUnicode_GET_SIZE(op) \
    (((PyUnicodeObject *)(op))->length)
#define PyUnicode_GET_DATA_SIZE(op) \
    (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
#define PyUnicode_AS_UNICODE(op) \
    (((PyUnicodeObject *)(op))->str)
#define PyUnicode_AS_DATA(op) \
    ((const char *)((PyUnicodeObject *)(op))->str)

/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)

235/* === Public API ========================================================= */
236
237/* --- Plain Py_UNICODE --------------------------------------------------- */
238
239/* Create a Unicode Object from the Py_UNICODE buffer u of the given
240 size. u may be NULL which causes the contents to be undefined. It
241 is the user's responsibility to fill in the needed data.
242
243 The buffer is copied into the new object. */
244
245extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
246 const Py_UNICODE *u, /* Unicode buffer */
247 int size /* size of buffer */
248 );
249
250/* Return a read-only pointer to the Unicode object's internal
251 Py_UNICODE buffer. */
252
253extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
254 PyObject *unicode /* Unicode object */
255 );
256
257/* Get the length of the Unicode object. */
258
259extern DL_IMPORT(int) PyUnicode_GetSize(
260 PyObject *unicode /* Unicode object */
261 );
262
Guido van Rossum52c23592000-04-10 13:41:41 +0000263/* Resize an already allocated Unicode object to the new size length.
264
265 *unicode is modified to point to the new (resized) object and 0
266 returned on success.
267
268 This API may only be called by the function which also called the
269 Unicode constructor. The refcount on the object must be 1. Otherwise,
270 an error is returned.
271
272 Error handling is implemented as follows: an exception is set, -1
273 is returned and *unicode left untouched.
274
275*/
276
277extern DL_IMPORT(int) PyUnicode_Resize(
278 PyObject **unicode, /* Pointer to the Unicode object */
279 int length /* New length */
280 );
281
Guido van Rossumd8225182000-03-10 22:33:05 +0000282/* Coerce obj to an Unicode object and return a reference with
283 *incremented* refcount.
284
285 Coercion is done in the following way:
286
287 1. Unicode objects are passed back as-is with incremented
288 refcount.
289
290 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000291 under the assumptions that they contain data using the current
292 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000293
294 3. All other objects raise an exception.
295
296 The API returns NULL in case of an error. The caller is responsible
297 for decref'ing the returned objects.
298
299*/
300
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000301extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
302 register PyObject *obj, /* Object */
303 const char *encoding, /* encoding */
304 const char *errors /* error handling */
305 );
306
307/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
308 which results in using the default encoding as basis for
309 decoding the object.
310
311 Coerces obj to an Unicode object and return a reference with
312 *incremented* refcount.
313
314 The API returns NULL in case of an error. The caller is responsible
315 for decref'ing the returned objects.
316
317*/
318
Guido van Rossumd8225182000-03-10 22:33:05 +0000319extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
320 register PyObject *obj /* Object */
321 );
322
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    int size                    /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */

extern DL_IMPORT(int) PyUnicode_AsWideChar(
    PyUnicodeObject *unicode,   /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    int size                    /* size of buffer */
    );

#endif

/* === Builtin Codecs =====================================================

   Many of these APIs take the two arguments encoding and errors. These
   parameters have the same semantics as the ones of the builtin
   unicode() API.

   Setting encoding to NULL causes the default encoding to be used.

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/

Fred Drakecb093fe2000-05-09 19:51:53 +0000369/* --- Manage the default encoding ---------------------------------------- */
370
371/* Returns the currently active default encoding.
372
373 The default encoding is currently implemented as run-time settable
374 process global. This may change in future versions of the
375 interpreter to become a parameter which is managed on a per-thread
376 basis.
377
378 */
379
Thomas Wouters5f375912000-07-22 23:30:03 +0000380extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000381
382/* Sets the currently active default encoding.
383
384 Returns 0 on success, -1 in case of an error.
385
386 */
387
388extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
389 const char *encoding /* Encoding name in standard form */
390 );
391
Guido van Rossumd8225182000-03-10 22:33:05 +0000392/* --- Generic Codecs ----------------------------------------------------- */
393
394/* Create a Unicode object by decoding the encoded string s of the
395 given size. */
396
397extern DL_IMPORT(PyObject*) PyUnicode_Decode(
398 const char *s, /* encoded string */
399 int size, /* size of buffer */
400 const char *encoding, /* encoding */
401 const char *errors /* error handling */
402 );
403
404/* Encodes a Py_UNICODE buffer of the given size and returns a
405 Python string object. */
406
407extern DL_IMPORT(PyObject*) PyUnicode_Encode(
408 const Py_UNICODE *s, /* Unicode char buffer */
409 int size, /* number of Py_UNICODE chars to encode */
410 const char *encoding, /* encoding */
411 const char *errors /* error handling */
412 );
413
414/* Encodes a Unicode object and returns the result as Python string
415 object. */
416
417extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
418 PyObject *unicode, /* Unicode object */
419 const char *encoding, /* encoding */
420 const char *errors /* error handling */
421 );
422
423/* --- UTF-8 Codecs ------------------------------------------------------- */
424
425extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
426 const char *string, /* UTF-8 encoded string */
427 int length, /* size of string */
428 const char *errors /* error handling */
429 );
430
431extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
432 PyObject *unicode /* Unicode object */
433 );
434
435extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
436 const Py_UNICODE *data, /* Unicode char buffer */
437 int length, /* number of Py_UNICODE chars to encode */
438 const char *errors /* error handling */
439 );
440
441/* --- UTF-16 Codecs ------------------------------------------------------ */
442
Guido van Rossum9e896b32000-04-05 20:11:21 +0000443/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000444 the corresponding Unicode object.
445
446 errors (if non-NULL) defines the error handling. It defaults
447 to "strict".
448
449 If byteorder is non-NULL, the decoder starts decoding using the
450 given byte order:
451
452 *byteorder == -1: little endian
453 *byteorder == 0: native order
454 *byteorder == 1: big endian
455
456 and then switches according to all BOM marks it finds in the input
457 data. BOM marks are not copied into the resulting Unicode string.
458 After completion, *byteorder is set to the current byte order at
459 the end of input data.
460
461 If byteorder is NULL, the codec starts in native order mode.
462
463*/
464
465extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
466 const char *string, /* UTF-16 encoded string */
467 int length, /* size of string */
468 const char *errors, /* error handling */
469 int *byteorder /* pointer to byteorder to use
470 0=native;-1=LE,1=BE; updated on
471 exit */
472 );
473
474/* Returns a Python string using the UTF-16 encoding in native byte
475 order. The string always starts with a BOM mark. */
476
477extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
478 PyObject *unicode /* Unicode object */
479 );
480
481/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000482 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000483
484 If byteorder is not 0, output is written according to the following
485 byte order:
486
487 byteorder == -1: little endian
488 byteorder == 0: native byte order (writes a BOM mark)
489 byteorder == 1: big endian
490
491 If byteorder is 0, the output string will always start with the
492 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
493 prepended.
494
495 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
496 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000497 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000498
499*/
500
501extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
502 const Py_UNICODE *data, /* Unicode char buffer */
503 int length, /* number of Py_UNICODE chars to encode */
504 const char *errors, /* error handling */
505 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
506 );
507
508/* --- Unicode-Escape Codecs ---------------------------------------------- */
509
510extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
511 const char *string, /* Unicode-Escape encoded string */
512 int length, /* size of string */
513 const char *errors /* error handling */
514 );
515
516extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
517 PyObject *unicode /* Unicode object */
518 );
519
520extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
521 const Py_UNICODE *data, /* Unicode char buffer */
522 int length /* Number of Py_UNICODE chars to encode */
523 );
524
525/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
526
527extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
528 const char *string, /* Raw-Unicode-Escape encoded string */
529 int length, /* size of string */
530 const char *errors /* error handling */
531 );
532
533extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
534 PyObject *unicode /* Unicode object */
535 );
536
537extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
538 const Py_UNICODE *data, /* Unicode char buffer */
539 int length /* Number of Py_UNICODE chars to encode */
540 );
541
542/* --- Latin-1 Codecs -----------------------------------------------------
543
544 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
545
546*/
547
548extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
549 const char *string, /* Latin-1 encoded string */
550 int length, /* size of string */
551 const char *errors /* error handling */
552 );
553
554extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
555 PyObject *unicode /* Unicode object */
556 );
557
558extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
559 const Py_UNICODE *data, /* Unicode char buffer */
560 int length, /* Number of Py_UNICODE chars to encode */
561 const char *errors /* error handling */
562 );
563
564/* --- ASCII Codecs -------------------------------------------------------
565
566 Only 7-bit ASCII data is excepted. All other codes generate errors.
567
568*/
569
570extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
571 const char *string, /* ASCII encoded string */
572 int length, /* size of string */
573 const char *errors /* error handling */
574 );
575
576extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
577 PyObject *unicode /* Unicode object */
578 );
579
580extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
581 const Py_UNICODE *data, /* Unicode char buffer */
582 int length, /* Number of Py_UNICODE chars to encode */
583 const char *errors /* error handling */
584 );
585
586/* --- Character Map Codecs -----------------------------------------------
587
588 This codec uses mappings to encode and decode characters.
589
590 Decoding mappings must map single string characters to single
591 Unicode characters, integers (which are then interpreted as Unicode
592 ordinals) or None (meaning "undefined mapping" and causing an
593 error).
594
595 Encoding mappings must map single Unicode characters to single
596 string characters, integers (which are then interpreted as Latin-1
597 ordinals) or None (meaning "undefined mapping" and causing an
598 error).
599
600 If a character lookup fails with a LookupError, the character is
601 copied as-is meaning that its ordinal value will be interpreted as
602 Unicode or Latin-1 ordinal resp. Because of this mappings only need
603 to contain those mappings which map characters to different code
604 points.
605
606*/
607
608extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
609 const char *string, /* Encoded string */
610 int length, /* size of string */
611 PyObject *mapping, /* character mapping
612 (char ordinal -> unicode ordinal) */
613 const char *errors /* error handling */
614 );
615
616extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
617 PyObject *unicode, /* Unicode object */
618 PyObject *mapping /* character mapping
619 (unicode ordinal -> char ordinal) */
620 );
621
622extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
623 const Py_UNICODE *data, /* Unicode char buffer */
624 int length, /* Number of Py_UNICODE chars to encode */
625 PyObject *mapping, /* character mapping
626 (unicode ordinal -> char ordinal) */
627 const char *errors /* error handling */
628 );
629
630/* Translate a Py_UNICODE buffer of the given length by applying a
631 character mapping table to it and return the resulting Unicode
632 object.
633
634 The mapping table must map Unicode ordinal integers to Unicode
635 ordinal integers or None (causing deletion of the character).
636
637 Mapping tables may be dictionaries or sequences. Unmapped character
638 ordinals (ones which cause a LookupError) are left untouched and
639 are copied as-is.
640
641*/
642
643extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
644 const Py_UNICODE *data, /* Unicode char buffer */
645 int length, /* Number of Py_UNICODE chars to encode */
646 PyObject *table, /* Translate table */
647 const char *errors /* error handling */
648 );
649
#ifdef MS_WIN32

/* --- MBCS codecs for Windows -------------------------------------------- */

/* Only declared when MS_WIN32 is defined. */

extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
    int length,                 /* size of string */
    const char *errors          /* error handling */
    );

extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
    const Py_UNICODE *data,     /* Unicode char buffer */
    int length,                 /* Number of Py_UNICODE chars to encode */
    const char *errors          /* error handling */
    );

#endif /* MS_WIN32 */

Guido van Rossum9e896b32000-04-05 20:11:21 +0000672/* --- Decimal Encoder ---------------------------------------------------- */
673
674/* Takes a Unicode string holding a decimal value and writes it into
675 an output buffer using standard ASCII digit codes.
676
677 The output buffer has to provide at least length+1 bytes of storage
678 area. The output string is 0-terminated.
679
680 The encoder converts whitespace to ' ', decimal characters to their
681 corresponding ASCII digit and all other Latin-1 characters except
682 \0 as-is. Characters outside this range (Unicode ordinals 1-256)
683 are treated as errors. This includes embedded NULL bytes.
684
685 Error handling is defined by the errors argument:
686
687 NULL or "strict": raise a ValueError
688 "ignore": ignore the wrong characters (these are not copied to the
689 output buffer)
690 "replace": replaces illegal characters with '?'
691
692 Returns 0 on success, -1 on failure.
693
694*/
695
696extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
697 Py_UNICODE *s, /* Unicode buffer */
698 int length, /* Number of Py_UNICODE chars to encode */
699 char *output, /* Output buffer; must have size >= length */
700 const char *errors /* error handling */
701 );
702
Guido van Rossumd8225182000-03-10 22:33:05 +0000703/* --- Methods & Slots ----------------------------------------------------
704
705 These are capable of handling Unicode objects and strings on input
706 (we refer to them as strings in the descriptions) and return
707 Unicode objects or integers as apporpriate. */
708
709/* Concat two strings giving a new Unicode string. */
710
711extern DL_IMPORT(PyObject*) PyUnicode_Concat(
712 PyObject *left, /* Left string */
713 PyObject *right /* Right string */
714 );
715
716/* Split a string giving a list of Unicode strings.
717
718 If sep is NULL, splitting will be done at all whitespace
719 substrings. Otherwise, splits occur at the given separator.
720
721 At most maxsplit splits will be done. If negative, no limit is set.
722
723 Separators are not included in the resulting list.
724
725*/
726
727extern DL_IMPORT(PyObject*) PyUnicode_Split(
728 PyObject *s, /* String to split */
729 PyObject *sep, /* String separator */
730 int maxsplit /* Maxsplit count */
731 );
732
733/* Dito, but split at line breaks.
734
735 CRLF is considered to be one line break. Line breaks are not
736 included in the resulting list. */
737
738extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
739 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000740 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000741 );
742
743/* Translate a string by applying a character mapping table to it and
744 return the resulting Unicode object.
745
746 The mapping table must map Unicode ordinal integers to Unicode
747 ordinal integers or None (causing deletion of the character).
748
749 Mapping tables may be dictionaries or sequences. Unmapped character
750 ordinals (ones which cause a LookupError) are left untouched and
751 are copied as-is.
752
753*/
754
755extern DL_IMPORT(PyObject *) PyUnicode_Translate(
756 PyObject *str, /* String */
757 PyObject *table, /* Translate table */
758 const char *errors /* error handling */
759 );
760
761/* Join a sequence of strings using the given separator and return
762 the resulting Unicode string. */
763
764extern DL_IMPORT(PyObject*) PyUnicode_Join(
765 PyObject *separator, /* Separator string */
766 PyObject *seq /* Sequence object */
767 );
768
769/* Return 1 if substr matches str[start:end] at the given tail end, 0
770 otherwise. */
771
772extern DL_IMPORT(int) PyUnicode_Tailmatch(
773 PyObject *str, /* String */
774 PyObject *substr, /* Prefix or Suffix string */
775 int start, /* Start index */
776 int end, /* Stop index */
777 int direction /* Tail end: -1 prefix, +1 suffix */
778 );
779
780/* Return the first position of substr in str[start:end] using the
781 given search direction or -1 if not found. */
782
783extern DL_IMPORT(int) PyUnicode_Find(
784 PyObject *str, /* String */
785 PyObject *substr, /* Substring to find */
786 int start, /* Start index */
787 int end, /* Stop index */
788 int direction /* Find direction: +1 forward, -1 backward */
789 );
790
Barry Warsaw51ac5802000-03-20 16:36:48 +0000791/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000792
793extern DL_IMPORT(int) PyUnicode_Count(
794 PyObject *str, /* String */
795 PyObject *substr, /* Substring to count */
796 int start, /* Start index */
797 int end /* Stop index */
798 );
799
Barry Warsaw51ac5802000-03-20 16:36:48 +0000800/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000801 and return the resulting Unicode object. */
802
803extern DL_IMPORT(PyObject *) PyUnicode_Replace(
804 PyObject *str, /* String */
805 PyObject *substr, /* Substring to find */
806 PyObject *replstr, /* Substring to replace */
807 int maxcount /* Max. number of replacements to apply;
808 -1 = all */
809 );
810
811/* Compare two strings and return -1, 0, 1 for less than, equal,
812 greater than resp. */
813
814extern DL_IMPORT(int) PyUnicode_Compare(
815 PyObject *left, /* Left string */
816 PyObject *right /* Right string */
817 );
818
Thomas Wouters7e474022000-07-16 12:04:32 +0000819/* Apply a argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +0000820 the resulting Unicode string. */
821
822extern DL_IMPORT(PyObject *) PyUnicode_Format(
823 PyObject *format, /* Format string */
824 PyObject *args /* Argument tuple or dictionary */
825 );
826
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000827/* Checks whether element is contained in container and return 1/0
828 accordingly.
829
830 element has to coerce to an one element Unicode string. -1 is
831 returned in case of an error. */
832
833extern DL_IMPORT(int) PyUnicode_Contains(
834 PyObject *container, /* Container string */
835 PyObject *element /* Element string */
836 );
837
Guido van Rossumd8225182000-03-10 22:33:05 +0000838/* === Characters Type APIs =============================================== */
839
840/* These should not be used directly. Use the Py_UNICODE_IS* and
841 Py_UNICODE_TO* macros instead.
842
843 These APIs are implemented in Objects/unicodectype.c.
844
845*/
846
847extern DL_IMPORT(int) _PyUnicode_IsLowercase(
848 register const Py_UNICODE ch /* Unicode character */
849 );
850
851extern DL_IMPORT(int) _PyUnicode_IsUppercase(
852 register const Py_UNICODE ch /* Unicode character */
853 );
854
855extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
856 register const Py_UNICODE ch /* Unicode character */
857 );
858
859extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
860 register const Py_UNICODE ch /* Unicode character */
861 );
862
863extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
864 register const Py_UNICODE ch /* Unicode character */
865 );
866
867extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
868 register const Py_UNICODE ch /* Unicode character */
869 );
870
871extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
872 register const Py_UNICODE ch /* Unicode character */
873 );
874
875extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
876 register const Py_UNICODE ch /* Unicode character */
877 );
878
879extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
880 register const Py_UNICODE ch /* Unicode character */
881 );
882
883extern DL_IMPORT(int) _PyUnicode_ToDigit(
884 register const Py_UNICODE ch /* Unicode character */
885 );
886
887extern DL_IMPORT(double) _PyUnicode_ToNumeric(
888 register const Py_UNICODE ch /* Unicode character */
889 );
890
891extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
892 register const Py_UNICODE ch /* Unicode character */
893 );
894
895extern DL_IMPORT(int) _PyUnicode_IsDigit(
896 register const Py_UNICODE ch /* Unicode character */
897 );
898
899extern DL_IMPORT(int) _PyUnicode_IsNumeric(
900 register const Py_UNICODE ch /* Unicode character */
901 );
902
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000903extern DL_IMPORT(int) _PyUnicode_IsAlpha(
904 register const Py_UNICODE ch /* Unicode character */
905 );
906
#ifdef __cplusplus
}
#endif
#endif /* !Py_UNICODEOBJECT_H */