blob: 74cb0334db84088517a822ca9127d765b20bd978 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
/* Windows has a usable wchar_t type: turn HAVE_USABLE_WCHAR_T on for
   MS_WIN32 builds unless it has been set already. */
#if defined(MS_WIN32) && !defined(HAVE_USABLE_WCHAR_T)
# define HAVE_USABLE_WCHAR_T
#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77 through the interface functions PyUnicode_FromWideChar() and
78 PyUnicode_AsWideChar(). */
79
/* A usable wchar_t implies that the wchar.h header is available.
   HAVE_WCHAR_H is only defined here if the build configuration has
   not defined it already: redefining a macro with a different
   replacement list (for instance a configure-generated
   "#define HAVE_WCHAR_H 1") is a constraint violation. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif
83
84#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +000085/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87# include <time.h>
88# endif
Guido van Rossumd8225182000-03-10 22:33:05 +000089# include "wchar.h"
90#endif
91
/* Py_UNICODE is the storage unit of the raw Unicode buffer.

   Default: unsigned short. Use this if you have a standard ANSI
   compiler without wchar_t support. If a short is not 16 bits on your
   platform, you have to fix the typedef below, or the module
   initialization code will complain.

   With HAVE_USABLE_WCHAR_T: the compiler defines wchar_t as a 16-bit
   unsigned type, so the compiler type is used directly. Works fine
   with all modern Windows platforms. */

#ifndef HAVE_USABLE_WCHAR_T
typedef unsigned short Py_UNICODE;
#else
typedef wchar_t Py_UNICODE;
#endif
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114 configure Python using --with-ctype-functions. This reduces the
115 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000116
/* Character type macros.

   Only the whitespace, alphabetic and lower/upper case macros depend
   on the configuration: they map to the compiler's wctype.h functions
   when wchar_t is usable and WANT_WCTYPE_FUNCTIONS is defined, and to
   the _PyUnicode_* functions otherwise. All remaining macros map to
   the _PyUnicode_* functions in both configurations and are therefore
   defined once, after the conditional section. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* Configuration independent mappings */

#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000166
/* True if ch passes any of the alpha, decimal, digit or numeric
   tests. The tests are tried in that order and stop at the first
   hit, so ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)                                  \
    (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) ||      \
     Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch))
172
/* Copy length Py_UNICODE items from source to target with memcpy();
   the expression evaluates to memcpy()'s result. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
175
/* Store value into the first length items of target.

   Each argument is evaluated exactly once and kept in a local whose
   name ends in an underscore. The loop counter previously was a plain
   "i", which captured argument expressions mentioning a caller
   variable of that name: Py_UNICODE_FILL(buf, ch, i) compared the
   counter with itself and filled nothing. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     int n_ = (length); int i_;\
     for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;}\
    while (0)
179
/* True if substring occurs in string at the given offset. The first
   items are compared directly before the whole of substring is
   compared with memcmp(). No bounds checking is done here. */
#define Py_UNICODE_MATCH(string, offset, substring)\
    (((string)->str[(offset)] == (substring)->str[0]) &&\
     memcmp(&(string)->str[(offset)], &(substring)->str[0],\
            sizeof(Py_UNICODE) * (substring)->length) == 0)
184
Barry Warsaw51ac5802000-03-20 16:36:48 +0000185#ifdef __cplusplus
186extern "C" {
187#endif
188
Guido van Rossumd8225182000-03-10 22:33:05 +0000189/* --- Unicode Type ------------------------------------------------------- */
190
/* Object structure of the Unicode type. The fast access macros
   PyUnicode_GET_SIZE() and PyUnicode_AS_UNICODE() read the length and
   str fields directly. */
typedef struct {
    PyObject_HEAD
    int length;                 /* Length of raw Unicode data in buffer,
                                   counted in Py_UNICODE items */
    Py_UNICODE *str;            /* Raw Unicode buffer */
    long hash;                  /* Hash value; -1 if not set */
    PyObject *utf8str;          /* UTF-8 encoded version as Python string,
                                   or NULL */
} PyUnicodeObject;
199
200extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
201
/* True if the type pointer of op is exactly PyUnicode_Type. */
#define PyUnicode_Check(op) \
    ((op)->ob_type == &PyUnicode_Type)
203
204/* Fast access macros */
/* These cast op to PyUnicodeObject* without any type check. */

/* Number of Py_UNICODE items stored in the object */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)

/* Pointer to the raw Py_UNICODE buffer of the object */
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)

/* Size of the raw buffer in bytes */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * sizeof(Py_UNICODE))

/* The raw buffer viewed as read-only bytes */
#define PyUnicode_AS_DATA(op) \
    ((const char *)PyUnicode_AS_UNICODE(op))
213
214/* --- Constants ---------------------------------------------------------- */
215
/* Replacement character used during decoding when the errors argument
   is set to "replace". U+FFFD is the official REPLACEMENT CHARACTER
   in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER \
    ((Py_UNICODE)0xFFFD)
222
223/* === Public API ========================================================= */
224
225/* --- Plain Py_UNICODE --------------------------------------------------- */
226
227/* Create a Unicode Object from the Py_UNICODE buffer u of the given
228 size. u may be NULL which causes the contents to be undefined. It
229 is the user's responsibility to fill in the needed data.
230
231 The buffer is copied into the new object. */
232
233extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
234 const Py_UNICODE *u, /* Unicode buffer */
235 int size /* size of buffer */
236 );
237
238/* Return a read-only pointer to the Unicode object's internal
239 Py_UNICODE buffer. */
240
241extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
242 PyObject *unicode /* Unicode object */
243 );
244
245/* Get the length of the Unicode object. */
246
247extern DL_IMPORT(int) PyUnicode_GetSize(
248 PyObject *unicode /* Unicode object */
249 );
250
Guido van Rossum52c23592000-04-10 13:41:41 +0000251/* Resize an already allocated Unicode object to the new size length.
252
253 *unicode is modified to point to the new (resized) object and 0
254 returned on success.
255
256 This API may only be called by the function which also called the
257 Unicode constructor. The refcount on the object must be 1. Otherwise,
258 an error is returned.
259
260 Error handling is implemented as follows: an exception is set, -1
261 is returned and *unicode left untouched.
262
263*/
264
265extern DL_IMPORT(int) PyUnicode_Resize(
266 PyObject **unicode, /* Pointer to the Unicode object */
267 int length /* New length */
268 );
269
/* Coerce obj to a Unicode object and return a reference with
271 *incremented* refcount.
272
273 Coercion is done in the following way:
274
275 1. Unicode objects are passed back as-is with incremented
276 refcount.
277
278 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000279 under the assumptions that they contain data using the current
280 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000281
282 3. All other objects raise an exception.
283
284 The API returns NULL in case of an error. The caller is responsible
285 for decref'ing the returned objects.
286
287*/
288
289extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
290 register PyObject *obj /* Object */
291 );
292
293/* --- wchar_t support for platforms which support it --------------------- */
294
295#ifdef HAVE_WCHAR_H
296
/* Create a Unicode Object from the wchar_t buffer w of the given
298 size.
299
300 The buffer is copied into the new object. */
301
302extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
303 register const wchar_t *w, /* wchar_t buffer */
304 int size /* size of buffer */
305 );
306
/* Copies the Unicode Object contents into the wchar_t buffer w. At
308 most size wchar_t characters are copied.
309
310 Returns the number of wchar_t characters copied or -1 in case of an
311 error. */
312
313extern DL_IMPORT(int) PyUnicode_AsWideChar(
314 PyUnicodeObject *unicode, /* Unicode object */
315 register wchar_t *w, /* wchar_t buffer */
316 int size /* size of buffer */
317 );
318
319#endif
320
321/* === Builtin Codecs =====================================================
322
323 Many of these APIs take two arguments encoding and errors. These
324 parameters encoding and errors have the same semantics as the ones
325 of the builtin unicode() API.
326
Fred Drakecb093fe2000-05-09 19:51:53 +0000327 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000328
329 Error handling is set by errors which may also be set to NULL
330 meaning to use the default handling defined for the codec. Default
331 error handling for all builtin codecs is "strict" (ValueErrors are
332 raised).
333
334 The codecs all use a similar interface. Only deviation from the
335 generic ones are documented.
336
337*/
338
Fred Drakecb093fe2000-05-09 19:51:53 +0000339/* --- Manage the default encoding ---------------------------------------- */
340
341/* Returns the currently active default encoding.
342
343 The default encoding is currently implemented as run-time settable
344 process global. This may change in future versions of the
345 interpreter to become a parameter which is managed on a per-thread
346 basis.
347
348 */
349
350extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding();
351
352/* Sets the currently active default encoding.
353
354 Returns 0 on success, -1 in case of an error.
355
356 */
357
358extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
359 const char *encoding /* Encoding name in standard form */
360 );
361
Guido van Rossumd8225182000-03-10 22:33:05 +0000362/* --- Generic Codecs ----------------------------------------------------- */
363
364/* Create a Unicode object by decoding the encoded string s of the
365 given size. */
366
367extern DL_IMPORT(PyObject*) PyUnicode_Decode(
368 const char *s, /* encoded string */
369 int size, /* size of buffer */
370 const char *encoding, /* encoding */
371 const char *errors /* error handling */
372 );
373
374/* Encodes a Py_UNICODE buffer of the given size and returns a
375 Python string object. */
376
377extern DL_IMPORT(PyObject*) PyUnicode_Encode(
378 const Py_UNICODE *s, /* Unicode char buffer */
379 int size, /* number of Py_UNICODE chars to encode */
380 const char *encoding, /* encoding */
381 const char *errors /* error handling */
382 );
383
384/* Encodes a Unicode object and returns the result as Python string
385 object. */
386
387extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
388 PyObject *unicode, /* Unicode object */
389 const char *encoding, /* encoding */
390 const char *errors /* error handling */
391 );
392
393/* --- UTF-8 Codecs ------------------------------------------------------- */
394
395extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
396 const char *string, /* UTF-8 encoded string */
397 int length, /* size of string */
398 const char *errors /* error handling */
399 );
400
401extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
402 PyObject *unicode /* Unicode object */
403 );
404
405extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
406 const Py_UNICODE *data, /* Unicode char buffer */
407 int length, /* number of Py_UNICODE chars to encode */
408 const char *errors /* error handling */
409 );
410
411/* --- UTF-16 Codecs ------------------------------------------------------ */
412
Guido van Rossum9e896b32000-04-05 20:11:21 +0000413/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000414 the corresponding Unicode object.
415
416 errors (if non-NULL) defines the error handling. It defaults
417 to "strict".
418
419 If byteorder is non-NULL, the decoder starts decoding using the
420 given byte order:
421
422 *byteorder == -1: little endian
423 *byteorder == 0: native order
424 *byteorder == 1: big endian
425
426 and then switches according to all BOM marks it finds in the input
427 data. BOM marks are not copied into the resulting Unicode string.
428 After completion, *byteorder is set to the current byte order at
429 the end of input data.
430
431 If byteorder is NULL, the codec starts in native order mode.
432
433*/
434
435extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
436 const char *string, /* UTF-16 encoded string */
437 int length, /* size of string */
438 const char *errors, /* error handling */
439 int *byteorder /* pointer to byteorder to use
440 0=native;-1=LE,1=BE; updated on
441 exit */
442 );
443
444/* Returns a Python string using the UTF-16 encoding in native byte
445 order. The string always starts with a BOM mark. */
446
447extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
448 PyObject *unicode /* Unicode object */
449 );
450
451/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000452 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000453
454 If byteorder is not 0, output is written according to the following
455 byte order:
456
457 byteorder == -1: little endian
458 byteorder == 0: native byte order (writes a BOM mark)
459 byteorder == 1: big endian
460
461 If byteorder is 0, the output string will always start with the
462 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
463 prepended.
464
465 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
466 UCS-2. This trick makes it possible to add full UTF-16 capabilities
   at a later point without compromising the APIs.
468
469*/
470
471extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
472 const Py_UNICODE *data, /* Unicode char buffer */
473 int length, /* number of Py_UNICODE chars to encode */
474 const char *errors, /* error handling */
475 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
476 );
477
478/* --- Unicode-Escape Codecs ---------------------------------------------- */
479
480extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
481 const char *string, /* Unicode-Escape encoded string */
482 int length, /* size of string */
483 const char *errors /* error handling */
484 );
485
486extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
487 PyObject *unicode /* Unicode object */
488 );
489
490extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
491 const Py_UNICODE *data, /* Unicode char buffer */
492 int length /* Number of Py_UNICODE chars to encode */
493 );
494
495/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
496
497extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
498 const char *string, /* Raw-Unicode-Escape encoded string */
499 int length, /* size of string */
500 const char *errors /* error handling */
501 );
502
503extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
504 PyObject *unicode /* Unicode object */
505 );
506
507extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
508 const Py_UNICODE *data, /* Unicode char buffer */
509 int length /* Number of Py_UNICODE chars to encode */
510 );
511
512/* --- Latin-1 Codecs -----------------------------------------------------
513
514 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
515
516*/
517
518extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
519 const char *string, /* Latin-1 encoded string */
520 int length, /* size of string */
521 const char *errors /* error handling */
522 );
523
524extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
525 PyObject *unicode /* Unicode object */
526 );
527
528extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
529 const Py_UNICODE *data, /* Unicode char buffer */
530 int length, /* Number of Py_UNICODE chars to encode */
531 const char *errors /* error handling */
532 );
533
534/* --- ASCII Codecs -------------------------------------------------------
535
   Only 7-bit ASCII data is accepted. All other codes generate errors.
537
538*/
539
540extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
541 const char *string, /* ASCII encoded string */
542 int length, /* size of string */
543 const char *errors /* error handling */
544 );
545
546extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
547 PyObject *unicode /* Unicode object */
548 );
549
550extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
551 const Py_UNICODE *data, /* Unicode char buffer */
552 int length, /* Number of Py_UNICODE chars to encode */
553 const char *errors /* error handling */
554 );
555
556/* --- Character Map Codecs -----------------------------------------------
557
558 This codec uses mappings to encode and decode characters.
559
560 Decoding mappings must map single string characters to single
561 Unicode characters, integers (which are then interpreted as Unicode
562 ordinals) or None (meaning "undefined mapping" and causing an
563 error).
564
565 Encoding mappings must map single Unicode characters to single
566 string characters, integers (which are then interpreted as Latin-1
567 ordinals) or None (meaning "undefined mapping" and causing an
568 error).
569
570 If a character lookup fails with a LookupError, the character is
571 copied as-is meaning that its ordinal value will be interpreted as
572 Unicode or Latin-1 ordinal resp. Because of this mappings only need
573 to contain those mappings which map characters to different code
574 points.
575
576*/
577
578extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
579 const char *string, /* Encoded string */
580 int length, /* size of string */
581 PyObject *mapping, /* character mapping
582 (char ordinal -> unicode ordinal) */
583 const char *errors /* error handling */
584 );
585
586extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
587 PyObject *unicode, /* Unicode object */
588 PyObject *mapping /* character mapping
589 (unicode ordinal -> char ordinal) */
590 );
591
592extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
593 const Py_UNICODE *data, /* Unicode char buffer */
594 int length, /* Number of Py_UNICODE chars to encode */
595 PyObject *mapping, /* character mapping
596 (unicode ordinal -> char ordinal) */
597 const char *errors /* error handling */
598 );
599
600/* Translate a Py_UNICODE buffer of the given length by applying a
601 character mapping table to it and return the resulting Unicode
602 object.
603
604 The mapping table must map Unicode ordinal integers to Unicode
605 ordinal integers or None (causing deletion of the character).
606
607 Mapping tables may be dictionaries or sequences. Unmapped character
608 ordinals (ones which cause a LookupError) are left untouched and
609 are copied as-is.
610
611*/
612
613extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
614 const Py_UNICODE *data, /* Unicode char buffer */
615 int length, /* Number of Py_UNICODE chars to encode */
616 PyObject *table, /* Translate table */
617 const char *errors /* error handling */
618 );
619
Guido van Rossumefec1152000-03-28 02:01:15 +0000620#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000621
Guido van Rossumefec1152000-03-28 02:01:15 +0000622/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000623
Guido van Rossumefec1152000-03-28 02:01:15 +0000624extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
625 const char *string, /* MBCS encoded string */
626 int length, /* size of string */
627 const char *errors /* error handling */
628 );
629
630extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
631 PyObject *unicode /* Unicode object */
632 );
633
634extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
635 const Py_UNICODE *data, /* Unicode char buffer */
636 int length, /* Number of Py_UNICODE chars to encode */
637 const char *errors /* error handling */
638 );
639
Guido van Rossumefec1152000-03-28 02:01:15 +0000640#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000641
Guido van Rossum9e896b32000-04-05 20:11:21 +0000642/* --- Decimal Encoder ---------------------------------------------------- */
643
644/* Takes a Unicode string holding a decimal value and writes it into
645 an output buffer using standard ASCII digit codes.
646
647 The output buffer has to provide at least length+1 bytes of storage
648 area. The output string is 0-terminated.
649
   The encoder converts whitespace to ' ', decimal characters to their
   corresponding ASCII digit and all other Latin-1 characters except
   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   are treated as errors. This includes embedded NULL bytes.
654
655 Error handling is defined by the errors argument:
656
657 NULL or "strict": raise a ValueError
658 "ignore": ignore the wrong characters (these are not copied to the
659 output buffer)
660 "replace": replaces illegal characters with '?'
661
662 Returns 0 on success, -1 on failure.
663
664*/
665
extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
    Py_UNICODE *s,              /* Unicode buffer */
    int length,                 /* Number of Py_UNICODE chars to encode */
    char *output,               /* Output buffer; must have size >= length+1
                                   (room for the terminating 0) */
    const char *errors          /* error handling */
    );
672
Guido van Rossumd8225182000-03-10 22:33:05 +0000673/* --- Methods & Slots ----------------------------------------------------
674
675 These are capable of handling Unicode objects and strings on input
676 (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
678
679/* Concat two strings giving a new Unicode string. */
680
681extern DL_IMPORT(PyObject*) PyUnicode_Concat(
682 PyObject *left, /* Left string */
683 PyObject *right /* Right string */
684 );
685
686/* Split a string giving a list of Unicode strings.
687
688 If sep is NULL, splitting will be done at all whitespace
689 substrings. Otherwise, splits occur at the given separator.
690
691 At most maxsplit splits will be done. If negative, no limit is set.
692
693 Separators are not included in the resulting list.
694
695*/
696
697extern DL_IMPORT(PyObject*) PyUnicode_Split(
698 PyObject *s, /* String to split */
699 PyObject *sep, /* String separator */
700 int maxsplit /* Maxsplit count */
701 );
702
/* Ditto, but split at line breaks.
704
705 CRLF is considered to be one line break. Line breaks are not
706 included in the resulting list. */
707
708extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
709 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000710 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000711 );
712
713/* Translate a string by applying a character mapping table to it and
714 return the resulting Unicode object.
715
716 The mapping table must map Unicode ordinal integers to Unicode
717 ordinal integers or None (causing deletion of the character).
718
719 Mapping tables may be dictionaries or sequences. Unmapped character
720 ordinals (ones which cause a LookupError) are left untouched and
721 are copied as-is.
722
723*/
724
725extern DL_IMPORT(PyObject *) PyUnicode_Translate(
726 PyObject *str, /* String */
727 PyObject *table, /* Translate table */
728 const char *errors /* error handling */
729 );
730
731/* Join a sequence of strings using the given separator and return
732 the resulting Unicode string. */
733
734extern DL_IMPORT(PyObject*) PyUnicode_Join(
735 PyObject *separator, /* Separator string */
736 PyObject *seq /* Sequence object */
737 );
738
739/* Return 1 if substr matches str[start:end] at the given tail end, 0
740 otherwise. */
741
742extern DL_IMPORT(int) PyUnicode_Tailmatch(
743 PyObject *str, /* String */
744 PyObject *substr, /* Prefix or Suffix string */
745 int start, /* Start index */
746 int end, /* Stop index */
747 int direction /* Tail end: -1 prefix, +1 suffix */
748 );
749
750/* Return the first position of substr in str[start:end] using the
751 given search direction or -1 if not found. */
752
753extern DL_IMPORT(int) PyUnicode_Find(
754 PyObject *str, /* String */
755 PyObject *substr, /* Substring to find */
756 int start, /* Start index */
757 int end, /* Stop index */
758 int direction /* Find direction: +1 forward, -1 backward */
759 );
760
Barry Warsaw51ac5802000-03-20 16:36:48 +0000761/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000762
763extern DL_IMPORT(int) PyUnicode_Count(
764 PyObject *str, /* String */
765 PyObject *substr, /* Substring to count */
766 int start, /* Start index */
767 int end /* Stop index */
768 );
769
Barry Warsaw51ac5802000-03-20 16:36:48 +0000770/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000771 and return the resulting Unicode object. */
772
773extern DL_IMPORT(PyObject *) PyUnicode_Replace(
774 PyObject *str, /* String */
775 PyObject *substr, /* Substring to find */
776 PyObject *replstr, /* Substring to replace */
777 int maxcount /* Max. number of replacements to apply;
778 -1 = all */
779 );
780
781/* Compare two strings and return -1, 0, 1 for less than, equal,
782 greater than resp. */
783
784extern DL_IMPORT(int) PyUnicode_Compare(
785 PyObject *left, /* Left string */
786 PyObject *right /* Right string */
787 );
788
/* Apply an argument tuple or dictionary to a format string and return
790 the resulting Unicode string. */
791
792extern DL_IMPORT(PyObject *) PyUnicode_Format(
793 PyObject *format, /* Format string */
794 PyObject *args /* Argument tuple or dictionary */
795 );
796
/* Checks whether element is contained in container and returns 1/0
   accordingly.

   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
802
803extern DL_IMPORT(int) PyUnicode_Contains(
804 PyObject *container, /* Container string */
805 PyObject *element /* Element string */
806 );
807
Guido van Rossumd8225182000-03-10 22:33:05 +0000808/* === Characters Type APIs =============================================== */
809
810/* These should not be used directly. Use the Py_UNICODE_IS* and
811 Py_UNICODE_TO* macros instead.
812
813 These APIs are implemented in Objects/unicodectype.c.
814
815*/
816
817extern DL_IMPORT(int) _PyUnicode_IsLowercase(
818 register const Py_UNICODE ch /* Unicode character */
819 );
820
821extern DL_IMPORT(int) _PyUnicode_IsUppercase(
822 register const Py_UNICODE ch /* Unicode character */
823 );
824
825extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
826 register const Py_UNICODE ch /* Unicode character */
827 );
828
829extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
830 register const Py_UNICODE ch /* Unicode character */
831 );
832
833extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
834 register const Py_UNICODE ch /* Unicode character */
835 );
836
837extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
838 register const Py_UNICODE ch /* Unicode character */
839 );
840
841extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
842 register const Py_UNICODE ch /* Unicode character */
843 );
844
845extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
846 register const Py_UNICODE ch /* Unicode character */
847 );
848
849extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
850 register const Py_UNICODE ch /* Unicode character */
851 );
852
853extern DL_IMPORT(int) _PyUnicode_ToDigit(
854 register const Py_UNICODE ch /* Unicode character */
855 );
856
857extern DL_IMPORT(double) _PyUnicode_ToNumeric(
858 register const Py_UNICODE ch /* Unicode character */
859 );
860
861extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
862 register const Py_UNICODE ch /* Unicode character */
863 );
864
865extern DL_IMPORT(int) _PyUnicode_IsDigit(
866 register const Py_UNICODE ch /* Unicode character */
867 );
868
869extern DL_IMPORT(int) _PyUnicode_IsNumeric(
870 register const Py_UNICODE ch /* Unicode character */
871 );
872
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000873extern DL_IMPORT(int) _PyUnicode_IsAlpha(
874 register const Py_UNICODE ch /* Unicode character */
875 );
876
Guido van Rossumd8225182000-03-10 22:33:05 +0000877#ifdef __cplusplus
878}
879#endif
880#endif /* !Py_UNICODEOBJECT_H */