blob: e88b8ed3399a32aaf16e455a24078fe6e90a5721 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
/* Windows has a usable wchar_t type, so switch the flag on there unless
   the build configuration has already set it. */
#if defined(MS_WIN32) && !defined(HAVE_USABLE_WCHAR_T)
# define HAVE_USABLE_WCHAR_T
#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar().  HAVE_USABLE_WCHAR_T always switches
   HAVE_WCHAR_H on as well. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
# define HAVE_WCHAR_H
#endif

#if defined(HAVE_WCHAR_H)
/* Work around a cosmetic bug in BSDI 4.x wchar.h (time.h is pulled in
   first); thanks to Thomas Wouters */
# if defined(_HAVE_BSDI)
#  include <time.h>
# endif
# include "wchar.h"
#endif
93
#ifndef HAVE_USABLE_WCHAR_T

/* No usable wchar_t: fall back to a type every standard ANSI compiler
   offers.  If a short is not 16 bits on your platform, you have to fix
   the typedef below, or the module initialization code will complain. */

typedef unsigned short Py_UNICODE;

#else

/* The compiler defines wchar_t as a 16-bit unsigned type, so the
   compiler type can be used directly.  Works fine with all modern
   Windows platforms. */

typedef wchar_t Py_UNICODE;

#endif
111
/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as single unsigned integer.
 *
 * unsigned int is used when the build configuration reports it to be
 * at least 4 bytes wide.  In every other case -- including a missing
 * SIZEOF_INT / SIZEOF_LONG configuration, which used to leave Py_UCS4
 * undeclared -- unsigned long is used; ISO C guarantees that it can
 * hold at least 32 bits.
 */
#if defined(SIZEOF_INT) && SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif
121
122
Guido van Rossumd8225182000-03-10 22:33:05 +0000123/* --- Internal Unicode Operations ---------------------------------------- */
124
125/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000126 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
127 configure Python using --with-ctype-functions. This reduces the
128 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000129
/* Operations for which wctype.h offers a counterpart.  They map onto the
   compiler's functions when both HAVE_USABLE_WCHAR_T and
   WANT_WCTYPE_FUNCTIONS are defined and onto the _PyUnicode_*() APIs
   otherwise. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* Operations which map onto the _PyUnicode_*() APIs in both
   configurations. */

#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000179
/* True when ch is alphabetic or is classified as a decimal, digit or
   numeric character.  The tests are tried in that order and stop at the
   first hit, so ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)          \
    (Py_UNICODE_ISALPHA(ch)             \
     || Py_UNICODE_ISDECIMAL(ch)        \
     || Py_UNICODE_ISDIGIT(ch)          \
     || Py_UNICODE_ISNUMERIC(ch))
185
/* Copy length Py_UNICODE characters from source to target with memcpy().
   The expression evaluates to what memcpy() returns, i.e. target. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
188
/* Store value into the first length elements of target.

   target and value are evaluated exactly once and captured in
   block-local temporaries.  The temporaries and the loop index carry a
   trailing underscore so that they cannot shadow a variable used in
   the macro arguments: with a plain "i" as index,
   Py_UNICODE_FILL(buf, ch, i) compared the index against itself and
   filled nothing.  length is still evaluated on every iteration. */
#define Py_UNICODE_FILL(target, value, length) do\
    {int i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
    } while (0)
192
/* Non-zero when the characters of substring are found in string at
   position offset.  The first characters are compared before memcmp()
   is called, so a mismatch there is rejected without the function
   call.  Both operands are accessed through their str and length
   members; no bounds checking is done here. */
#define Py_UNICODE_MATCH(string, offset, substring)\
    (((string)->str[(offset)] == (substring)->str[0]) &&\
     memcmp((string)->str + (offset), (substring)->str,\
            (substring)->length*sizeof(Py_UNICODE)) == 0)
197
Barry Warsaw51ac5802000-03-20 16:36:48 +0000198#ifdef __cplusplus
199extern "C" {
200#endif
201
Guido van Rossumd8225182000-03-10 22:33:05 +0000202/* --- Unicode Type ------------------------------------------------------- */
203
204typedef struct {
205 PyObject_HEAD
206 int length; /* Length of raw Unicode data in buffer */
207 Py_UNICODE *str; /* Raw Unicode buffer */
208 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000209 PyObject *defenc; /* (Default) Encoded version as Python
210 string, or NULL; this is used for
211 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000212} PyUnicodeObject;
213
214extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
215
/* Exact type test: true only when op's ob_type is &PyUnicode_Type. */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
217
/* Fast access macros.  op is cast to PyUnicodeObject * without any type
   check and the struct members are read directly. */

/* Number of Py_UNICODE characters stored in op */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)

/* Size of op's character data in bytes */
#define PyUnicode_GET_DATA_SIZE(op) \
    (sizeof(Py_UNICODE) * ((PyUnicodeObject *)(op))->length)

/* op's internal buffer as Py_UNICODE pointer */
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)

/* op's internal buffer as const char pointer */
#define PyUnicode_AS_DATA(op) ((const char *)((PyUnicodeObject *)(op))->str)
227
228/* --- Constants ---------------------------------------------------------- */
229
230/* This Unicode character will be used as replacement character during
231 decoding if the errors argument is set to "replace". Note: the
232 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
233 Unicode 3.0. */
234
235#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
236
237/* === Public API ========================================================= */
238
239/* --- Plain Py_UNICODE --------------------------------------------------- */
240
241/* Create a Unicode Object from the Py_UNICODE buffer u of the given
242 size. u may be NULL which causes the contents to be undefined. It
243 is the user's responsibility to fill in the needed data.
244
245 The buffer is copied into the new object. */
246
247extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
248 const Py_UNICODE *u, /* Unicode buffer */
249 int size /* size of buffer */
250 );
251
252/* Return a read-only pointer to the Unicode object's internal
253 Py_UNICODE buffer. */
254
255extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
256 PyObject *unicode /* Unicode object */
257 );
258
259/* Get the length of the Unicode object. */
260
261extern DL_IMPORT(int) PyUnicode_GetSize(
262 PyObject *unicode /* Unicode object */
263 );
264
Guido van Rossum52c23592000-04-10 13:41:41 +0000265/* Resize an already allocated Unicode object to the new size length.
266
267 *unicode is modified to point to the new (resized) object and 0
268 returned on success.
269
270 This API may only be called by the function which also called the
271 Unicode constructor. The refcount on the object must be 1. Otherwise,
272 an error is returned.
273
274 Error handling is implemented as follows: an exception is set, -1
275 is returned and *unicode left untouched.
276
277*/
278
279extern DL_IMPORT(int) PyUnicode_Resize(
280 PyObject **unicode, /* Pointer to the Unicode object */
281 int length /* New length */
282 );
283
Guido van Rossumd8225182000-03-10 22:33:05 +0000284/* Coerce obj to an Unicode object and return a reference with
285 *incremented* refcount.
286
287 Coercion is done in the following way:
288
289 1. Unicode objects are passed back as-is with incremented
290 refcount.
291
292 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000293 under the assumptions that they contain data using the current
294 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000295
296 3. All other objects raise an exception.
297
298 The API returns NULL in case of an error. The caller is responsible
299 for decref'ing the returned objects.
300
301*/
302
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000303extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
304 register PyObject *obj, /* Object */
305 const char *encoding, /* encoding */
306 const char *errors /* error handling */
307 );
308
309/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
310 which results in using the default encoding as basis for
311 decoding the object.
312
313 Coerces obj to an Unicode object and return a reference with
314 *incremented* refcount.
315
316 The API returns NULL in case of an error. The caller is responsible
317 for decref'ing the returned objects.
318
319*/
320
Guido van Rossumd8225182000-03-10 22:33:05 +0000321extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
322 register PyObject *obj /* Object */
323 );
324
325/* --- wchar_t support for platforms which support it --------------------- */
326
327#ifdef HAVE_WCHAR_H
328
/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */
333
334extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
335 register const wchar_t *w, /* wchar_t buffer */
336 int size /* size of buffer */
337 );
338
/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */
344
345extern DL_IMPORT(int) PyUnicode_AsWideChar(
346 PyUnicodeObject *unicode, /* Unicode object */
347 register wchar_t *w, /* wchar_t buffer */
348 int size /* size of buffer */
349 );
350
351#endif
352
353/* === Builtin Codecs =====================================================
354
355 Many of these APIs take two arguments encoding and errors. These
356 parameters encoding and errors have the same semantics as the ones
357 of the builtin unicode() API.
358
Fred Drakecb093fe2000-05-09 19:51:53 +0000359 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000360
361 Error handling is set by errors which may also be set to NULL
362 meaning to use the default handling defined for the codec. Default
363 error handling for all builtin codecs is "strict" (ValueErrors are
364 raised).
365
366 The codecs all use a similar interface. Only deviation from the
367 generic ones are documented.
368
369*/
370
Fred Drakecb093fe2000-05-09 19:51:53 +0000371/* --- Manage the default encoding ---------------------------------------- */
372
373/* Returns the currently active default encoding.
374
375 The default encoding is currently implemented as run-time settable
376 process global. This may change in future versions of the
377 interpreter to become a parameter which is managed on a per-thread
378 basis.
379
380 */
381
Thomas Wouters5f375912000-07-22 23:30:03 +0000382extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000383
384/* Sets the currently active default encoding.
385
386 Returns 0 on success, -1 in case of an error.
387
388 */
389
390extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
391 const char *encoding /* Encoding name in standard form */
392 );
393
Guido van Rossumd8225182000-03-10 22:33:05 +0000394/* --- Generic Codecs ----------------------------------------------------- */
395
396/* Create a Unicode object by decoding the encoded string s of the
397 given size. */
398
399extern DL_IMPORT(PyObject*) PyUnicode_Decode(
400 const char *s, /* encoded string */
401 int size, /* size of buffer */
402 const char *encoding, /* encoding */
403 const char *errors /* error handling */
404 );
405
406/* Encodes a Py_UNICODE buffer of the given size and returns a
407 Python string object. */
408
409extern DL_IMPORT(PyObject*) PyUnicode_Encode(
410 const Py_UNICODE *s, /* Unicode char buffer */
411 int size, /* number of Py_UNICODE chars to encode */
412 const char *encoding, /* encoding */
413 const char *errors /* error handling */
414 );
415
416/* Encodes a Unicode object and returns the result as Python string
417 object. */
418
419extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
420 PyObject *unicode, /* Unicode object */
421 const char *encoding, /* encoding */
422 const char *errors /* error handling */
423 );
424
425/* --- UTF-8 Codecs ------------------------------------------------------- */
426
427extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
428 const char *string, /* UTF-8 encoded string */
429 int length, /* size of string */
430 const char *errors /* error handling */
431 );
432
433extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
434 PyObject *unicode /* Unicode object */
435 );
436
437extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
438 const Py_UNICODE *data, /* Unicode char buffer */
439 int length, /* number of Py_UNICODE chars to encode */
440 const char *errors /* error handling */
441 );
442
443/* --- UTF-16 Codecs ------------------------------------------------------ */
444
Guido van Rossum9e896b32000-04-05 20:11:21 +0000445/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000446 the corresponding Unicode object.
447
448 errors (if non-NULL) defines the error handling. It defaults
449 to "strict".
450
451 If byteorder is non-NULL, the decoder starts decoding using the
452 given byte order:
453
454 *byteorder == -1: little endian
455 *byteorder == 0: native order
456 *byteorder == 1: big endian
457
458 and then switches according to all BOM marks it finds in the input
459 data. BOM marks are not copied into the resulting Unicode string.
460 After completion, *byteorder is set to the current byte order at
461 the end of input data.
462
463 If byteorder is NULL, the codec starts in native order mode.
464
465*/
466
467extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
468 const char *string, /* UTF-16 encoded string */
469 int length, /* size of string */
470 const char *errors, /* error handling */
471 int *byteorder /* pointer to byteorder to use
472 0=native;-1=LE,1=BE; updated on
473 exit */
474 );
475
476/* Returns a Python string using the UTF-16 encoding in native byte
477 order. The string always starts with a BOM mark. */
478
479extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
480 PyObject *unicode /* Unicode object */
481 );
482
483/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000484 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000485
486 If byteorder is not 0, output is written according to the following
487 byte order:
488
489 byteorder == -1: little endian
490 byteorder == 0: native byte order (writes a BOM mark)
491 byteorder == 1: big endian
492
493 If byteorder is 0, the output string will always start with the
494 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
495 prepended.
496
497 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
498 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000499 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000500
501*/
502
503extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
504 const Py_UNICODE *data, /* Unicode char buffer */
505 int length, /* number of Py_UNICODE chars to encode */
506 const char *errors, /* error handling */
507 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
508 );
509
510/* --- Unicode-Escape Codecs ---------------------------------------------- */
511
512extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
513 const char *string, /* Unicode-Escape encoded string */
514 int length, /* size of string */
515 const char *errors /* error handling */
516 );
517
518extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
519 PyObject *unicode /* Unicode object */
520 );
521
522extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
523 const Py_UNICODE *data, /* Unicode char buffer */
524 int length /* Number of Py_UNICODE chars to encode */
525 );
526
527/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
528
529extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
530 const char *string, /* Raw-Unicode-Escape encoded string */
531 int length, /* size of string */
532 const char *errors /* error handling */
533 );
534
535extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
536 PyObject *unicode /* Unicode object */
537 );
538
539extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
540 const Py_UNICODE *data, /* Unicode char buffer */
541 int length /* Number of Py_UNICODE chars to encode */
542 );
543
544/* --- Latin-1 Codecs -----------------------------------------------------
545
546 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
547
548*/
549
550extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
551 const char *string, /* Latin-1 encoded string */
552 int length, /* size of string */
553 const char *errors /* error handling */
554 );
555
556extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
557 PyObject *unicode /* Unicode object */
558 );
559
560extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
561 const Py_UNICODE *data, /* Unicode char buffer */
562 int length, /* Number of Py_UNICODE chars to encode */
563 const char *errors /* error handling */
564 );
565
/* --- ASCII Codecs -------------------------------------------------------

   Only 7-bit ASCII data is accepted. All other codes generate errors.

*/
571
572extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
573 const char *string, /* ASCII encoded string */
574 int length, /* size of string */
575 const char *errors /* error handling */
576 );
577
578extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
579 PyObject *unicode /* Unicode object */
580 );
581
582extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
583 const Py_UNICODE *data, /* Unicode char buffer */
584 int length, /* Number of Py_UNICODE chars to encode */
585 const char *errors /* error handling */
586 );
587
588/* --- Character Map Codecs -----------------------------------------------
589
590 This codec uses mappings to encode and decode characters.
591
592 Decoding mappings must map single string characters to single
593 Unicode characters, integers (which are then interpreted as Unicode
594 ordinals) or None (meaning "undefined mapping" and causing an
595 error).
596
597 Encoding mappings must map single Unicode characters to single
598 string characters, integers (which are then interpreted as Latin-1
599 ordinals) or None (meaning "undefined mapping" and causing an
600 error).
601
602 If a character lookup fails with a LookupError, the character is
603 copied as-is meaning that its ordinal value will be interpreted as
604 Unicode or Latin-1 ordinal resp. Because of this mappings only need
605 to contain those mappings which map characters to different code
606 points.
607
608*/
609
610extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
611 const char *string, /* Encoded string */
612 int length, /* size of string */
613 PyObject *mapping, /* character mapping
614 (char ordinal -> unicode ordinal) */
615 const char *errors /* error handling */
616 );
617
618extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
619 PyObject *unicode, /* Unicode object */
620 PyObject *mapping /* character mapping
621 (unicode ordinal -> char ordinal) */
622 );
623
624extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
625 const Py_UNICODE *data, /* Unicode char buffer */
626 int length, /* Number of Py_UNICODE chars to encode */
627 PyObject *mapping, /* character mapping
628 (unicode ordinal -> char ordinal) */
629 const char *errors /* error handling */
630 );
631
632/* Translate a Py_UNICODE buffer of the given length by applying a
633 character mapping table to it and return the resulting Unicode
634 object.
635
636 The mapping table must map Unicode ordinal integers to Unicode
637 ordinal integers or None (causing deletion of the character).
638
639 Mapping tables may be dictionaries or sequences. Unmapped character
640 ordinals (ones which cause a LookupError) are left untouched and
641 are copied as-is.
642
643*/
644
645extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
646 const Py_UNICODE *data, /* Unicode char buffer */
647 int length, /* Number of Py_UNICODE chars to encode */
648 PyObject *table, /* Translate table */
649 const char *errors /* error handling */
650 );
651
Guido van Rossumefec1152000-03-28 02:01:15 +0000652#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000653
Guido van Rossumefec1152000-03-28 02:01:15 +0000654/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000655
Guido van Rossumefec1152000-03-28 02:01:15 +0000656extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
657 const char *string, /* MBCS encoded string */
658 int length, /* size of string */
659 const char *errors /* error handling */
660 );
661
662extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
663 PyObject *unicode /* Unicode object */
664 );
665
666extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
667 const Py_UNICODE *data, /* Unicode char buffer */
668 int length, /* Number of Py_UNICODE chars to encode */
669 const char *errors /* error handling */
670 );
671
Guido van Rossumefec1152000-03-28 02:01:15 +0000672#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000673
Guido van Rossum9e896b32000-04-05 20:11:21 +0000674/* --- Decimal Encoder ---------------------------------------------------- */
675
/* Takes a Unicode string holding a decimal value and writes it into
   an output buffer using standard ASCII digit codes.

   The output buffer has to provide at least length+1 bytes of storage
   area. The output string is 0-terminated.

   The encoder converts whitespace to ' ', decimal characters to their
   corresponding ASCII digit and all other Latin-1 characters except
   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   are treated as errors. This includes embedded NUL characters.

   Error handling is defined by the errors argument:

   NULL or "strict": raise a ValueError
   "ignore": ignore the wrong characters (these are not copied to the
             output buffer)
   "replace": replaces illegal characters with '?'

   Returns 0 on success, -1 on failure.

*/
697
698extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
699 Py_UNICODE *s, /* Unicode buffer */
700 int length, /* Number of Py_UNICODE chars to encode */
701 char *output, /* Output buffer; must have size >= length */
702 const char *errors /* error handling */
703 );
704
/* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
710
711/* Concat two strings giving a new Unicode string. */
712
713extern DL_IMPORT(PyObject*) PyUnicode_Concat(
714 PyObject *left, /* Left string */
715 PyObject *right /* Right string */
716 );
717
718/* Split a string giving a list of Unicode strings.
719
720 If sep is NULL, splitting will be done at all whitespace
721 substrings. Otherwise, splits occur at the given separator.
722
723 At most maxsplit splits will be done. If negative, no limit is set.
724
725 Separators are not included in the resulting list.
726
727*/
728
729extern DL_IMPORT(PyObject*) PyUnicode_Split(
730 PyObject *s, /* String to split */
731 PyObject *sep, /* String separator */
732 int maxsplit /* Maxsplit count */
733 );
734
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
739
740extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
741 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000742 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000743 );
744
745/* Translate a string by applying a character mapping table to it and
746 return the resulting Unicode object.
747
748 The mapping table must map Unicode ordinal integers to Unicode
749 ordinal integers or None (causing deletion of the character).
750
751 Mapping tables may be dictionaries or sequences. Unmapped character
752 ordinals (ones which cause a LookupError) are left untouched and
753 are copied as-is.
754
755*/
756
757extern DL_IMPORT(PyObject *) PyUnicode_Translate(
758 PyObject *str, /* String */
759 PyObject *table, /* Translate table */
760 const char *errors /* error handling */
761 );
762
763/* Join a sequence of strings using the given separator and return
764 the resulting Unicode string. */
765
766extern DL_IMPORT(PyObject*) PyUnicode_Join(
767 PyObject *separator, /* Separator string */
768 PyObject *seq /* Sequence object */
769 );
770
771/* Return 1 if substr matches str[start:end] at the given tail end, 0
772 otherwise. */
773
774extern DL_IMPORT(int) PyUnicode_Tailmatch(
775 PyObject *str, /* String */
776 PyObject *substr, /* Prefix or Suffix string */
777 int start, /* Start index */
778 int end, /* Stop index */
779 int direction /* Tail end: -1 prefix, +1 suffix */
780 );
781
782/* Return the first position of substr in str[start:end] using the
783 given search direction or -1 if not found. */
784
785extern DL_IMPORT(int) PyUnicode_Find(
786 PyObject *str, /* String */
787 PyObject *substr, /* Substring to find */
788 int start, /* Start index */
789 int end, /* Stop index */
790 int direction /* Find direction: +1 forward, -1 backward */
791 );
792
Barry Warsaw51ac5802000-03-20 16:36:48 +0000793/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000794
795extern DL_IMPORT(int) PyUnicode_Count(
796 PyObject *str, /* String */
797 PyObject *substr, /* Substring to count */
798 int start, /* Start index */
799 int end /* Stop index */
800 );
801
Barry Warsaw51ac5802000-03-20 16:36:48 +0000802/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000803 and return the resulting Unicode object. */
804
805extern DL_IMPORT(PyObject *) PyUnicode_Replace(
806 PyObject *str, /* String */
807 PyObject *substr, /* Substring to find */
808 PyObject *replstr, /* Substring to replace */
809 int maxcount /* Max. number of replacements to apply;
810 -1 = all */
811 );
812
813/* Compare two strings and return -1, 0, 1 for less than, equal,
814 greater than resp. */
815
816extern DL_IMPORT(int) PyUnicode_Compare(
817 PyObject *left, /* Left string */
818 PyObject *right /* Right string */
819 );
820
/* Apply an argument tuple or dictionary to a format string and return
   the resulting Unicode string. */
823
824extern DL_IMPORT(PyObject *) PyUnicode_Format(
825 PyObject *format, /* Format string */
826 PyObject *args /* Argument tuple or dictionary */
827 );
828
/* Checks whether element is contained in container and returns 1/0
   accordingly.

   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
834
835extern DL_IMPORT(int) PyUnicode_Contains(
836 PyObject *container, /* Container string */
837 PyObject *element /* Element string */
838 );
839
Guido van Rossumd8225182000-03-10 22:33:05 +0000840/* === Characters Type APIs =============================================== */
841
842/* These should not be used directly. Use the Py_UNICODE_IS* and
843 Py_UNICODE_TO* macros instead.
844
845 These APIs are implemented in Objects/unicodectype.c.
846
847*/
848
849extern DL_IMPORT(int) _PyUnicode_IsLowercase(
850 register const Py_UNICODE ch /* Unicode character */
851 );
852
853extern DL_IMPORT(int) _PyUnicode_IsUppercase(
854 register const Py_UNICODE ch /* Unicode character */
855 );
856
857extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
858 register const Py_UNICODE ch /* Unicode character */
859 );
860
861extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
862 register const Py_UNICODE ch /* Unicode character */
863 );
864
865extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
866 register const Py_UNICODE ch /* Unicode character */
867 );
868
869extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
870 register const Py_UNICODE ch /* Unicode character */
871 );
872
873extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
874 register const Py_UNICODE ch /* Unicode character */
875 );
876
877extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
878 register const Py_UNICODE ch /* Unicode character */
879 );
880
881extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
882 register const Py_UNICODE ch /* Unicode character */
883 );
884
885extern DL_IMPORT(int) _PyUnicode_ToDigit(
886 register const Py_UNICODE ch /* Unicode character */
887 );
888
889extern DL_IMPORT(double) _PyUnicode_ToNumeric(
890 register const Py_UNICODE ch /* Unicode character */
891 );
892
893extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
894 register const Py_UNICODE ch /* Unicode character */
895 );
896
897extern DL_IMPORT(int) _PyUnicode_IsDigit(
898 register const Py_UNICODE ch /* Unicode character */
899 );
900
901extern DL_IMPORT(int) _PyUnicode_IsNumeric(
902 register const Py_UNICODE ch /* Unicode character */
903 );
904
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000905extern DL_IMPORT(int) _PyUnicode_IsAlpha(
906 register const Py_UNICODE ch /* Unicode character */
907 );
908
Guido van Rossumd8225182000-03-10 22:33:05 +0000909#ifdef __cplusplus
910}
911#endif
912#endif /* !Py_UNICODEOBJECT_H */