blob: 988ea1b39ec8af35b826d40d685c1ad313c6452c [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
67#ifndef HAVE_USABLE_WCHAR_T
68
69/* Windows has a usable wchar_t type */
70# if defined(MS_WIN32)
71# define HAVE_USABLE_WCHAR_T
72# endif
73
74#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77 through the interface functions PyUnicode_FromWideChar() and
78 PyUnicode_AsWideChar(). */
79
80#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +000081# ifndef HAVE_WCHAR_H
82# define HAVE_WCHAR_H
83# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000084#endif
85
86#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +000087/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
88# ifdef _HAVE_BSDI
89# include <time.h>
90# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000091# include "wchar.h"
92#endif
93
94#ifdef HAVE_USABLE_WCHAR_T
95
/* If the compiler defines wchar_t as a 16-bit unsigned type we can
   use the compiler type directly. Works fine with all modern Windows
   platforms. */
99
100typedef wchar_t Py_UNICODE;
101
102#else
103
104/* Use if you have a standard ANSI compiler, without wchar_t support.
105 If a short is not 16 bits on your platform, you have to fix the
106 typedef below, or the module initialization code will complain. */
107
108typedef unsigned short Py_UNICODE;
109
110#endif
111
/*
 * Py_UCS4: an unsigned integer type that is at least 32 bits wide.
 *
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as a single unsigned integer.
 *
 * unsigned int is used when the build configuration reports it as at
 * least 4 bytes wide.  In every other case unsigned long is used: the
 * C standard guarantees that long has at least 32 bits, so Py_UCS4 is
 * always declared -- even when the SIZEOF_* configuration macros are
 * not defined (an undefined macro evaluates to 0 inside #if).
 */
#if defined(SIZEOF_INT) && SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif
121
122
Guido van Rossumd8225182000-03-10 22:33:05 +0000123/* --- Internal Unicode Operations ---------------------------------------- */
124
125/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000126 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
127 configure Python using --with-ctype-functions. This reduces the
128 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000129
130#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
131
132#include "wctype.h"
133
134#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
135
136#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
137#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
138#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
139#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
140
141#define Py_UNICODE_TOLOWER(ch) towlower(ch)
142#define Py_UNICODE_TOUPPER(ch) towupper(ch)
143#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
144
145#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
146#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
147#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
148
149#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
150#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
151#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
152
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000153#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
154
Guido van Rossumd8225182000-03-10 22:33:05 +0000155#else
156
157#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
158
159#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
160#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
161#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
162#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
163
164#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
165#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
166#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
167
168#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
169#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
170#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
171
172#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
173#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
174#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
175
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000176#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000177
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000178#endif
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000179
180#define Py_UNICODE_ISALNUM(ch) \
181 (Py_UNICODE_ISALPHA(ch) || \
182 Py_UNICODE_ISDECIMAL(ch) || \
183 Py_UNICODE_ISDIGIT(ch) || \
184 Py_UNICODE_ISNUMERIC(ch))
185
Guido van Rossumd8225182000-03-10 22:33:05 +0000186#define Py_UNICODE_COPY(target, source, length)\
187 (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
188
/* Store value into the first length elements of the Py_UNICODE buffer
   target.

   target and value are each evaluated exactly once, so arguments with
   side effects (e.g. p++) behave as written.  The macro's locals carry
   a trailing underscore so that they cannot capture a caller variable
   such as i that appears inside an argument expression.  length is
   still evaluated on every loop iteration and compared against an int
   counter; a length <= 0 stores nothing.

   Expands to a single statement (do { ... } while (0)). */
#define Py_UNICODE_FILL(target, value, length) do\
    {int i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;}\
    while (0)
192
193#define Py_UNICODE_MATCH(string, offset, substring)\
Marc-André Lemburg2f4d0e92000-06-18 22:22:27 +0000194 ((*((string)->str + (offset)) == *((substring)->str)) &&\
195 !memcmp((string)->str + (offset), (substring)->str,\
Guido van Rossumd8225182000-03-10 22:33:05 +0000196 (substring)->length*sizeof(Py_UNICODE)))
197
Barry Warsaw51ac5802000-03-20 16:36:48 +0000198#ifdef __cplusplus
199extern "C" {
200#endif
201
Guido van Rossumd8225182000-03-10 22:33:05 +0000202/* --- Unicode Type ------------------------------------------------------- */
203
204typedef struct {
205 PyObject_HEAD
206 int length; /* Length of raw Unicode data in buffer */
207 Py_UNICODE *str; /* Raw Unicode buffer */
208 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000209 PyObject *defenc; /* (Default) Encoded version as Python
210 string, or NULL; this is used for
211 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000212} PyUnicodeObject;
213
214extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
215
216#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
217
218/* Fast access macros */
219#define PyUnicode_GET_SIZE(op) \
220 (((PyUnicodeObject *)(op))->length)
221#define PyUnicode_GET_DATA_SIZE(op) \
222 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
223#define PyUnicode_AS_UNICODE(op) \
224 (((PyUnicodeObject *)(op))->str)
225#define PyUnicode_AS_DATA(op) \
226 ((const char *)((PyUnicodeObject *)(op))->str)
227
228/* --- Constants ---------------------------------------------------------- */
229
230/* This Unicode character will be used as replacement character during
231 decoding if the errors argument is set to "replace". Note: the
232 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
233 Unicode 3.0. */
234
235#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
236
237/* === Public API ========================================================= */
238
239/* --- Plain Py_UNICODE --------------------------------------------------- */
240
241/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000242 size.
243
244 u may be NULL which causes the contents to be undefined. It is the
245 user's responsibility to fill in the needed data afterwards. Note
246 that modifying the Unicode object contents after construction is
247 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000248
249 The buffer is copied into the new object. */
250
251extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
252 const Py_UNICODE *u, /* Unicode buffer */
253 int size /* size of buffer */
254 );
255
256/* Return a read-only pointer to the Unicode object's internal
257 Py_UNICODE buffer. */
258
259extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
260 PyObject *unicode /* Unicode object */
261 );
262
263/* Get the length of the Unicode object. */
264
265extern DL_IMPORT(int) PyUnicode_GetSize(
266 PyObject *unicode /* Unicode object */
267 );
268
Guido van Rossum52c23592000-04-10 13:41:41 +0000269/* Resize an already allocated Unicode object to the new size length.
270
271 *unicode is modified to point to the new (resized) object and 0
272 returned on success.
273
274 This API may only be called by the function which also called the
275 Unicode constructor. The refcount on the object must be 1. Otherwise,
276 an error is returned.
277
278 Error handling is implemented as follows: an exception is set, -1
279 is returned and *unicode left untouched.
280
281*/
282
283extern DL_IMPORT(int) PyUnicode_Resize(
284 PyObject **unicode, /* Pointer to the Unicode object */
285 int length /* New length */
286 );
287
Guido van Rossumd8225182000-03-10 22:33:05 +0000288/* Coerce obj to an Unicode object and return a reference with
289 *incremented* refcount.
290
291 Coercion is done in the following way:
292
293 1. Unicode objects are passed back as-is with incremented
294 refcount.
295
296 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000297 under the assumptions that they contain data using the current
298 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000299
300 3. All other objects raise an exception.
301
302 The API returns NULL in case of an error. The caller is responsible
303 for decref'ing the returned objects.
304
305*/
306
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000307extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
308 register PyObject *obj, /* Object */
309 const char *encoding, /* encoding */
310 const char *errors /* error handling */
311 );
312
313/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
314 which results in using the default encoding as basis for
315 decoding the object.
316
317 Coerces obj to an Unicode object and return a reference with
318 *incremented* refcount.
319
320 The API returns NULL in case of an error. The caller is responsible
321 for decref'ing the returned objects.
322
323*/
324
Guido van Rossumd8225182000-03-10 22:33:05 +0000325extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
326 register PyObject *obj /* Object */
327 );
328
329/* --- wchar_t support for platforms which support it --------------------- */
330
331#ifdef HAVE_WCHAR_H
332
/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */
337
338extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
339 register const wchar_t *w, /* wchar_t buffer */
340 int size /* size of buffer */
341 );
342
/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */
348
349extern DL_IMPORT(int) PyUnicode_AsWideChar(
350 PyUnicodeObject *unicode, /* Unicode object */
351 register wchar_t *w, /* wchar_t buffer */
352 int size /* size of buffer */
353 );
354
355#endif
356
357/* === Builtin Codecs =====================================================
358
359 Many of these APIs take two arguments encoding and errors. These
360 parameters encoding and errors have the same semantics as the ones
361 of the builtin unicode() API.
362
Fred Drakecb093fe2000-05-09 19:51:53 +0000363 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000364
365 Error handling is set by errors which may also be set to NULL
366 meaning to use the default handling defined for the codec. Default
367 error handling for all builtin codecs is "strict" (ValueErrors are
368 raised).
369
370 The codecs all use a similar interface. Only deviation from the
371 generic ones are documented.
372
373*/
374
Fred Drakecb093fe2000-05-09 19:51:53 +0000375/* --- Manage the default encoding ---------------------------------------- */
376
377/* Returns the currently active default encoding.
378
379 The default encoding is currently implemented as run-time settable
380 process global. This may change in future versions of the
381 interpreter to become a parameter which is managed on a per-thread
382 basis.
383
384 */
385
Thomas Wouters5f375912000-07-22 23:30:03 +0000386extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000387
388/* Sets the currently active default encoding.
389
390 Returns 0 on success, -1 in case of an error.
391
392 */
393
394extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
395 const char *encoding /* Encoding name in standard form */
396 );
397
Guido van Rossumd8225182000-03-10 22:33:05 +0000398/* --- Generic Codecs ----------------------------------------------------- */
399
400/* Create a Unicode object by decoding the encoded string s of the
401 given size. */
402
403extern DL_IMPORT(PyObject*) PyUnicode_Decode(
404 const char *s, /* encoded string */
405 int size, /* size of buffer */
406 const char *encoding, /* encoding */
407 const char *errors /* error handling */
408 );
409
410/* Encodes a Py_UNICODE buffer of the given size and returns a
411 Python string object. */
412
413extern DL_IMPORT(PyObject*) PyUnicode_Encode(
414 const Py_UNICODE *s, /* Unicode char buffer */
415 int size, /* number of Py_UNICODE chars to encode */
416 const char *encoding, /* encoding */
417 const char *errors /* error handling */
418 );
419
420/* Encodes a Unicode object and returns the result as Python string
421 object. */
422
423extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
424 PyObject *unicode, /* Unicode object */
425 const char *encoding, /* encoding */
426 const char *errors /* error handling */
427 );
428
429/* --- UTF-8 Codecs ------------------------------------------------------- */
430
431extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
432 const char *string, /* UTF-8 encoded string */
433 int length, /* size of string */
434 const char *errors /* error handling */
435 );
436
437extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
438 PyObject *unicode /* Unicode object */
439 );
440
441extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
442 const Py_UNICODE *data, /* Unicode char buffer */
443 int length, /* number of Py_UNICODE chars to encode */
444 const char *errors /* error handling */
445 );
446
447/* --- UTF-16 Codecs ------------------------------------------------------ */
448
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000450 the corresponding Unicode object.
451
452 errors (if non-NULL) defines the error handling. It defaults
453 to "strict".
454
455 If byteorder is non-NULL, the decoder starts decoding using the
456 given byte order:
457
458 *byteorder == -1: little endian
459 *byteorder == 0: native order
460 *byteorder == 1: big endian
461
462 and then switches according to all BOM marks it finds in the input
463 data. BOM marks are not copied into the resulting Unicode string.
464 After completion, *byteorder is set to the current byte order at
465 the end of input data.
466
467 If byteorder is NULL, the codec starts in native order mode.
468
469*/
470
471extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
472 const char *string, /* UTF-16 encoded string */
473 int length, /* size of string */
474 const char *errors, /* error handling */
475 int *byteorder /* pointer to byteorder to use
476 0=native;-1=LE,1=BE; updated on
477 exit */
478 );
479
480/* Returns a Python string using the UTF-16 encoding in native byte
481 order. The string always starts with a BOM mark. */
482
483extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
484 PyObject *unicode /* Unicode object */
485 );
486
487/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000488 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000489
490 If byteorder is not 0, output is written according to the following
491 byte order:
492
493 byteorder == -1: little endian
494 byteorder == 0: native byte order (writes a BOM mark)
495 byteorder == 1: big endian
496
497 If byteorder is 0, the output string will always start with the
498 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
499 prepended.
500
501 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
502 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000503 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000504
505*/
506
507extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
508 const Py_UNICODE *data, /* Unicode char buffer */
509 int length, /* number of Py_UNICODE chars to encode */
510 const char *errors, /* error handling */
511 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
512 );
513
514/* --- Unicode-Escape Codecs ---------------------------------------------- */
515
516extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
517 const char *string, /* Unicode-Escape encoded string */
518 int length, /* size of string */
519 const char *errors /* error handling */
520 );
521
522extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
523 PyObject *unicode /* Unicode object */
524 );
525
526extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
527 const Py_UNICODE *data, /* Unicode char buffer */
528 int length /* Number of Py_UNICODE chars to encode */
529 );
530
531/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
532
533extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
534 const char *string, /* Raw-Unicode-Escape encoded string */
535 int length, /* size of string */
536 const char *errors /* error handling */
537 );
538
539extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
540 PyObject *unicode /* Unicode object */
541 );
542
543extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
544 const Py_UNICODE *data, /* Unicode char buffer */
545 int length /* Number of Py_UNICODE chars to encode */
546 );
547
548/* --- Latin-1 Codecs -----------------------------------------------------
549
550 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
551
552*/
553
554extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
555 const char *string, /* Latin-1 encoded string */
556 int length, /* size of string */
557 const char *errors /* error handling */
558 );
559
560extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
561 PyObject *unicode /* Unicode object */
562 );
563
564extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
565 const Py_UNICODE *data, /* Unicode char buffer */
566 int length, /* Number of Py_UNICODE chars to encode */
567 const char *errors /* error handling */
568 );
569
/* --- ASCII Codecs -------------------------------------------------------

   Only 7-bit ASCII data is accepted. All other codes generate errors.

*/
575
576extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
577 const char *string, /* ASCII encoded string */
578 int length, /* size of string */
579 const char *errors /* error handling */
580 );
581
582extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
583 PyObject *unicode /* Unicode object */
584 );
585
586extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
587 const Py_UNICODE *data, /* Unicode char buffer */
588 int length, /* Number of Py_UNICODE chars to encode */
589 const char *errors /* error handling */
590 );
591
592/* --- Character Map Codecs -----------------------------------------------
593
594 This codec uses mappings to encode and decode characters.
595
596 Decoding mappings must map single string characters to single
597 Unicode characters, integers (which are then interpreted as Unicode
598 ordinals) or None (meaning "undefined mapping" and causing an
599 error).
600
601 Encoding mappings must map single Unicode characters to single
602 string characters, integers (which are then interpreted as Latin-1
603 ordinals) or None (meaning "undefined mapping" and causing an
604 error).
605
606 If a character lookup fails with a LookupError, the character is
607 copied as-is meaning that its ordinal value will be interpreted as
608 Unicode or Latin-1 ordinal resp. Because of this mappings only need
609 to contain those mappings which map characters to different code
610 points.
611
612*/
613
614extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
615 const char *string, /* Encoded string */
616 int length, /* size of string */
617 PyObject *mapping, /* character mapping
618 (char ordinal -> unicode ordinal) */
619 const char *errors /* error handling */
620 );
621
622extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
623 PyObject *unicode, /* Unicode object */
624 PyObject *mapping /* character mapping
625 (unicode ordinal -> char ordinal) */
626 );
627
628extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
629 const Py_UNICODE *data, /* Unicode char buffer */
630 int length, /* Number of Py_UNICODE chars to encode */
631 PyObject *mapping, /* character mapping
632 (unicode ordinal -> char ordinal) */
633 const char *errors /* error handling */
634 );
635
636/* Translate a Py_UNICODE buffer of the given length by applying a
637 character mapping table to it and return the resulting Unicode
638 object.
639
640 The mapping table must map Unicode ordinal integers to Unicode
641 ordinal integers or None (causing deletion of the character).
642
643 Mapping tables may be dictionaries or sequences. Unmapped character
644 ordinals (ones which cause a LookupError) are left untouched and
645 are copied as-is.
646
647*/
648
649extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
650 const Py_UNICODE *data, /* Unicode char buffer */
651 int length, /* Number of Py_UNICODE chars to encode */
652 PyObject *table, /* Translate table */
653 const char *errors /* error handling */
654 );
655
Guido van Rossumefec1152000-03-28 02:01:15 +0000656#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000657
Guido van Rossumefec1152000-03-28 02:01:15 +0000658/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000659
Guido van Rossumefec1152000-03-28 02:01:15 +0000660extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
661 const char *string, /* MBCS encoded string */
662 int length, /* size of string */
663 const char *errors /* error handling */
664 );
665
666extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
667 PyObject *unicode /* Unicode object */
668 );
669
670extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
671 const Py_UNICODE *data, /* Unicode char buffer */
672 int length, /* Number of Py_UNICODE chars to encode */
673 const char *errors /* error handling */
674 );
675
Guido van Rossumefec1152000-03-28 02:01:15 +0000676#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000677
Guido van Rossum9e896b32000-04-05 20:11:21 +0000678/* --- Decimal Encoder ---------------------------------------------------- */
679
/* Takes a Unicode string holding a decimal value and writes it into
   an output buffer using standard ASCII digit codes.

   The output buffer has to provide at least length+1 bytes of storage
   area. The output string is 0-terminated.

   The encoder converts whitespace to ' ', decimal characters to their
   corresponding ASCII digit and all other Latin-1 characters except
   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   are treated as errors. This includes embedded NUL characters.

   Error handling is defined by the errors argument:

   NULL or "strict": raise a ValueError
   "ignore": ignore the wrong characters (these are not copied to the
             output buffer)
   "replace": replaces illegal characters with '?'

   Returns 0 on success, -1 on failure.

*/
701
702extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
703 Py_UNICODE *s, /* Unicode buffer */
704 int length, /* Number of Py_UNICODE chars to encode */
705 char *output, /* Output buffer; must have size >= length+1
                     (the output string is 0-terminated) */
706 const char *errors /* error handling */
707 );
708
/* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
714
715/* Concat two strings giving a new Unicode string. */
716
717extern DL_IMPORT(PyObject*) PyUnicode_Concat(
718 PyObject *left, /* Left string */
719 PyObject *right /* Right string */
720 );
721
722/* Split a string giving a list of Unicode strings.
723
724 If sep is NULL, splitting will be done at all whitespace
725 substrings. Otherwise, splits occur at the given separator.
726
727 At most maxsplit splits will be done. If negative, no limit is set.
728
729 Separators are not included in the resulting list.
730
731*/
732
733extern DL_IMPORT(PyObject*) PyUnicode_Split(
734 PyObject *s, /* String to split */
735 PyObject *sep, /* String separator */
736 int maxsplit /* Maxsplit count */
737 );
738
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
743
744extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
745 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000746 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000747 );
748
749/* Translate a string by applying a character mapping table to it and
750 return the resulting Unicode object.
751
752 The mapping table must map Unicode ordinal integers to Unicode
753 ordinal integers or None (causing deletion of the character).
754
755 Mapping tables may be dictionaries or sequences. Unmapped character
756 ordinals (ones which cause a LookupError) are left untouched and
757 are copied as-is.
758
759*/
760
761extern DL_IMPORT(PyObject *) PyUnicode_Translate(
762 PyObject *str, /* String */
763 PyObject *table, /* Translate table */
764 const char *errors /* error handling */
765 );
766
767/* Join a sequence of strings using the given separator and return
768 the resulting Unicode string. */
769
770extern DL_IMPORT(PyObject*) PyUnicode_Join(
771 PyObject *separator, /* Separator string */
772 PyObject *seq /* Sequence object */
773 );
774
775/* Return 1 if substr matches str[start:end] at the given tail end, 0
776 otherwise. */
777
778extern DL_IMPORT(int) PyUnicode_Tailmatch(
779 PyObject *str, /* String */
780 PyObject *substr, /* Prefix or Suffix string */
781 int start, /* Start index */
782 int end, /* Stop index */
783 int direction /* Tail end: -1 prefix, +1 suffix */
784 );
785
786/* Return the first position of substr in str[start:end] using the
787 given search direction or -1 if not found. */
788
789extern DL_IMPORT(int) PyUnicode_Find(
790 PyObject *str, /* String */
791 PyObject *substr, /* Substring to find */
792 int start, /* Start index */
793 int end, /* Stop index */
794 int direction /* Find direction: +1 forward, -1 backward */
795 );
796
Barry Warsaw51ac5802000-03-20 16:36:48 +0000797/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000798
799extern DL_IMPORT(int) PyUnicode_Count(
800 PyObject *str, /* String */
801 PyObject *substr, /* Substring to count */
802 int start, /* Start index */
803 int end /* Stop index */
804 );
805
Barry Warsaw51ac5802000-03-20 16:36:48 +0000806/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000807 and return the resulting Unicode object. */
808
809extern DL_IMPORT(PyObject *) PyUnicode_Replace(
810 PyObject *str, /* String */
811 PyObject *substr, /* Substring to find */
812 PyObject *replstr, /* Substring to replace */
813 int maxcount /* Max. number of replacements to apply;
814 -1 = all */
815 );
816
817/* Compare two strings and return -1, 0, 1 for less than, equal,
818 greater than resp. */
819
820extern DL_IMPORT(int) PyUnicode_Compare(
821 PyObject *left, /* Left string */
822 PyObject *right /* Right string */
823 );
824
/* Apply an argument tuple or dictionary to a format string and return
   the resulting Unicode string. */
827
828extern DL_IMPORT(PyObject *) PyUnicode_Format(
829 PyObject *format, /* Format string */
830 PyObject *args /* Argument tuple or dictionary */
831 );
832
/* Checks whether element is contained in container and returns 1/0
   accordingly.

   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
838
839extern DL_IMPORT(int) PyUnicode_Contains(
840 PyObject *container, /* Container string */
841 PyObject *element /* Element string */
842 );
843
Guido van Rossumd8225182000-03-10 22:33:05 +0000844/* === Characters Type APIs =============================================== */
845
846/* These should not be used directly. Use the Py_UNICODE_IS* and
847 Py_UNICODE_TO* macros instead.
848
849 These APIs are implemented in Objects/unicodectype.c.
850
851*/
852
853extern DL_IMPORT(int) _PyUnicode_IsLowercase(
854 register const Py_UNICODE ch /* Unicode character */
855 );
856
857extern DL_IMPORT(int) _PyUnicode_IsUppercase(
858 register const Py_UNICODE ch /* Unicode character */
859 );
860
861extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
862 register const Py_UNICODE ch /* Unicode character */
863 );
864
865extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
866 register const Py_UNICODE ch /* Unicode character */
867 );
868
869extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
870 register const Py_UNICODE ch /* Unicode character */
871 );
872
873extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
874 register const Py_UNICODE ch /* Unicode character */
875 );
876
877extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
878 register const Py_UNICODE ch /* Unicode character */
879 );
880
881extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
882 register const Py_UNICODE ch /* Unicode character */
883 );
884
885extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
886 register const Py_UNICODE ch /* Unicode character */
887 );
888
889extern DL_IMPORT(int) _PyUnicode_ToDigit(
890 register const Py_UNICODE ch /* Unicode character */
891 );
892
893extern DL_IMPORT(double) _PyUnicode_ToNumeric(
894 register const Py_UNICODE ch /* Unicode character */
895 );
896
897extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
898 register const Py_UNICODE ch /* Unicode character */
899 );
900
901extern DL_IMPORT(int) _PyUnicode_IsDigit(
902 register const Py_UNICODE ch /* Unicode character */
903 );
904
905extern DL_IMPORT(int) _PyUnicode_IsNumeric(
906 register const Py_UNICODE ch /* Unicode character */
907 );
908
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000909extern DL_IMPORT(int) _PyUnicode_IsAlpha(
910 register const Py_UNICODE ch /* Unicode character */
911 );
912
Guido van Rossumd8225182000-03-10 22:33:05 +0000913#ifdef __cplusplus
914}
915#endif
916#endif /* !Py_UNICODEOBJECT_H */