blob: d89537fc91edbba59156c4939f2d2b30cdd19803 [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
   properly set, but the default rules below don't set it. I'll
   sort this out some other day -- fredrik@pythonware.com */
64
65#ifndef Py_UNICODE_SIZE
66#error Must define Py_UNICODE_SIZE
67#endif
68
Fredrik Lundh1294ad02001-06-26 17:17:07 +000069/* experimental UCS-4 support. enable at your own risk! */
70#undef USE_UCS4_STORAGE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000071#if Py_UNICODE_SIZE == 4
72#define USE_UCS4_STORAGE
73#endif
Fredrik Lundh1294ad02001-06-26 17:17:07 +000074
Guido van Rossumd8225182000-03-10 22:33:05 +000075/* Set these flags if the platform has "wchar.h", "wctype.h" and the
76 wchar_t type is a 16-bit unsigned type */
77/* #define HAVE_WCHAR_H */
78/* #define HAVE_USABLE_WCHAR_T */
79
80/* Defaults for various platforms */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000081#ifndef PY_UNICODE_TYPE
Guido van Rossumd8225182000-03-10 22:33:05 +000082
Fredrik Lundh1294ad02001-06-26 17:17:07 +000083/* Windows has a usable wchar_t type (unless we're using UCS-4) */
84# if defined(MS_WIN32) && !defined(USE_UCS4_STORAGE)
Guido van Rossumd8225182000-03-10 22:33:05 +000085# define HAVE_USABLE_WCHAR_T
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +000086# define PY_UNICODE_TYPE wchar_t
87# endif
88
89# if defined(USE_UCS4_STORAGE)
90# define PY_UNICODE_TYPE Py_UCS4
Guido van Rossumd8225182000-03-10 22:33:05 +000091# endif
92
93#endif
94
95/* If the compiler provides a wchar_t type we try to support it
96 through the interface functions PyUnicode_FromWideChar() and
97 PyUnicode_AsWideChar(). */
98
99#ifdef HAVE_USABLE_WCHAR_T
Marc-André Lemburg1a731c62000-08-11 11:43:10 +0000100# ifndef HAVE_WCHAR_H
101# define HAVE_WCHAR_H
102# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000103#endif
104
105#ifdef HAVE_WCHAR_H
Guido van Rossum24bdb042000-03-28 20:29:59 +0000106/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
107# ifdef _HAVE_BSDI
108# include <time.h>
109# endif
Guido van Rossumd8225182000-03-10 22:33:05 +0000110# include "wchar.h"
111#endif
112
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000113/*
114 * Use this typedef when you need to represent a UTF-16 surrogate pair
115 * as single unsigned integer.
116 */
117#if SIZEOF_INT >= 4
118typedef unsigned int Py_UCS4;
119#elif SIZEOF_LONG >= 4
120typedef unsigned long Py_UCS4;
Guido van Rossumd8225182000-03-10 22:33:05 +0000121#endif
122
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000123#if SIZEOF_SHORT == 2
124typedef unsigned short Py_UCS2;
125#else
126#error Cannot find a two-byte type
127#endif
Marc-André Lemburg43279102000-07-07 09:01:41 +0000128
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000129typedef PY_UNICODE_TYPE Py_UNICODE;
Marc-André Lemburg43279102000-07-07 09:01:41 +0000130
Guido van Rossumd8225182000-03-10 22:33:05 +0000131/* --- Internal Unicode Operations ---------------------------------------- */
132
133/* If you want Python to use the compiler's wctype.h functions instead
Barry Warsaw51ac5802000-03-20 16:36:48 +0000134 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
135 configure Python using --with-ctype-functions. This reduces the
136 interpreter's code size. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000137
138#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
139
140#include "wctype.h"
141
142#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
143
144#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
145#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
146#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
147#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
148
149#define Py_UNICODE_TOLOWER(ch) towlower(ch)
150#define Py_UNICODE_TOUPPER(ch) towupper(ch)
151#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
152
153#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
154#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
155#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000161#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
162
Guido van Rossumd8225182000-03-10 22:33:05 +0000163#else
164
165#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
166
167#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
168#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
169#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
170#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
171
172#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
173#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
174#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
175
176#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
177#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
178#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
179
180#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
181#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
182#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
183
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000184#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
Guido van Rossumd8225182000-03-10 22:33:05 +0000185
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000186#endif
Marc-André Lemburga9c103b2000-07-03 10:52:13 +0000187
188#define Py_UNICODE_ISALNUM(ch) \
189 (Py_UNICODE_ISALPHA(ch) || \
190 Py_UNICODE_ISDECIMAL(ch) || \
191 Py_UNICODE_ISDIGIT(ch) || \
192 Py_UNICODE_ISNUMERIC(ch))
193
/* Copy `length` Py_UNICODE elements from `source` to `target` using
   memcpy(), so the two regions must not overlap.  The expression
   yields memcpy()'s return value, i.e. `target`. */
#define Py_UNICODE_COPY(target, source, length)\
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
196
/* Store `value` into the first `length` elements of the Py_UNICODE
   buffer `target`.

   `target` and `value` are evaluated exactly once, so arguments with
   side effects (e.g. `p++`) behave as a function call would.  `length`
   is still re-evaluated on every loop test.

   The loop state lives in underscore-suffixed locals: with a plain
   `i` counter, a call such as Py_UNICODE_FILL(buf, ch, i) would have
   its `i` argument captured by the macro's own variable and fill
   nothing. */
#define Py_UNICODE_FILL(target, value, length) do\
    {int i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;}\
    while (0)
200
/* Nonzero iff the Py_UNICODE data of `substring` equals the data of
   `string` starting at element index `offset`.

   Both arguments are pointers to objects with `str` and `length`
   members.  The first elements are compared directly before falling
   back to memcmp() over substring->length elements.  No bounds
   checking is done: string->str[offset] and substring->str[0] are
   always read. */
#define Py_UNICODE_MATCH(string, offset, substring)\
    (((string)->str[(offset)] == (substring)->str[0]) &&\
     memcmp(&(string)->str[(offset)], (substring)->str,\
            (substring)->length*sizeof(Py_UNICODE)) == 0)
205
Barry Warsaw51ac5802000-03-20 16:36:48 +0000206#ifdef __cplusplus
207extern "C" {
208#endif
209
Guido van Rossumd8225182000-03-10 22:33:05 +0000210/* --- Unicode Type ------------------------------------------------------- */
211
212typedef struct {
213 PyObject_HEAD
214 int length; /* Length of raw Unicode data in buffer */
215 Py_UNICODE *str; /* Raw Unicode buffer */
216 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 PyObject *defenc; /* (Default) Encoded version as Python
218 string, or NULL; this is used for
219 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000220} PyUnicodeObject;
221
222extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
223
/* Nonzero iff op's ob_type is exactly &PyUnicode_Type (pointer
   comparison only).  op must be a valid, non-NULL object pointer. */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
225
/* Fast access macros.

   Each one casts op to PyUnicodeObject * and reads the field
   directly; no type or NULL checking is performed. */

/* Length of op in Py_UNICODE elements. */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)

/* Length of op's Unicode data in bytes. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))

/* op's internal Py_UNICODE buffer. */
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)

/* op's internal buffer viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)((PyUnicodeObject *)(op))->str)
235
236/* --- Constants ---------------------------------------------------------- */
237
238/* This Unicode character will be used as replacement character during
239 decoding if the errors argument is set to "replace". Note: the
240 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
241 Unicode 3.0. */
242
243#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
244
245/* === Public API ========================================================= */
246
247/* --- Plain Py_UNICODE --------------------------------------------------- */
248
249/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250 size.
251
252 u may be NULL which causes the contents to be undefined. It is the
253 user's responsibility to fill in the needed data afterwards. Note
254 that modifying the Unicode object contents after construction is
255 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000256
257 The buffer is copied into the new object. */
258
259extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
260 const Py_UNICODE *u, /* Unicode buffer */
261 int size /* size of buffer */
262 );
263
264/* Return a read-only pointer to the Unicode object's internal
265 Py_UNICODE buffer. */
266
267extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
268 PyObject *unicode /* Unicode object */
269 );
270
271/* Get the length of the Unicode object. */
272
273extern DL_IMPORT(int) PyUnicode_GetSize(
274 PyObject *unicode /* Unicode object */
275 );
276
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000277/* Get the maximum ordinal for a Unicode character. */
278extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
279
Guido van Rossum52c23592000-04-10 13:41:41 +0000280/* Resize an already allocated Unicode object to the new size length.
281
282 *unicode is modified to point to the new (resized) object and 0
283 returned on success.
284
285 This API may only be called by the function which also called the
286 Unicode constructor. The refcount on the object must be 1. Otherwise,
287 an error is returned.
288
289 Error handling is implemented as follows: an exception is set, -1
290 is returned and *unicode left untouched.
291
292*/
293
294extern DL_IMPORT(int) PyUnicode_Resize(
295 PyObject **unicode, /* Pointer to the Unicode object */
296 int length /* New length */
297 );
298
/* Coerce obj to a Unicode object and return a reference with
   *incremented* refcount.
301
302 Coercion is done in the following way:
303
304 1. Unicode objects are passed back as-is with incremented
305 refcount.
306
307 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000308 under the assumptions that they contain data using the current
309 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000310
311 3. All other objects raise an exception.
312
313 The API returns NULL in case of an error. The caller is responsible
314 for decref'ing the returned objects.
315
316*/
317
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000318extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
319 register PyObject *obj, /* Object */
320 const char *encoding, /* encoding */
321 const char *errors /* error handling */
322 );
323
324/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
325 which results in using the default encoding as basis for
326 decoding the object.
327
   Coerces obj to a Unicode object and returns a reference with
   *incremented* refcount.
330
331 The API returns NULL in case of an error. The caller is responsible
332 for decref'ing the returned objects.
333
334*/
335
Guido van Rossumd8225182000-03-10 22:33:05 +0000336extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
337 register PyObject *obj /* Object */
338 );
339
340/* --- wchar_t support for platforms which support it --------------------- */
341
342#ifdef HAVE_WCHAR_H
343
/* Create a Unicode Object from the wchar_t buffer w of the given
   size.
346
347 The buffer is copied into the new object. */
348
349extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
350 register const wchar_t *w, /* wchar_t buffer */
351 int size /* size of buffer */
352 );
353
/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.
356
357 Returns the number of wchar_t characters copied or -1 in case of an
358 error. */
359
360extern DL_IMPORT(int) PyUnicode_AsWideChar(
361 PyUnicodeObject *unicode, /* Unicode object */
362 register wchar_t *w, /* wchar_t buffer */
363 int size /* size of buffer */
364 );
365
366#endif
367
368/* === Builtin Codecs =====================================================
369
370 Many of these APIs take two arguments encoding and errors. These
371 parameters encoding and errors have the same semantics as the ones
372 of the builtin unicode() API.
373
Fred Drakecb093fe2000-05-09 19:51:53 +0000374 Setting encoding to NULL causes the default encoding to be used.
Guido van Rossumd8225182000-03-10 22:33:05 +0000375
376 Error handling is set by errors which may also be set to NULL
377 meaning to use the default handling defined for the codec. Default
378 error handling for all builtin codecs is "strict" (ValueErrors are
379 raised).
380
   The codecs all use a similar interface. Only deviations from the
   generic interface are documented.
383
384*/
385
Fred Drakecb093fe2000-05-09 19:51:53 +0000386/* --- Manage the default encoding ---------------------------------------- */
387
388/* Returns the currently active default encoding.
389
390 The default encoding is currently implemented as run-time settable
391 process global. This may change in future versions of the
392 interpreter to become a parameter which is managed on a per-thread
393 basis.
394
395 */
396
Thomas Wouters5f375912000-07-22 23:30:03 +0000397extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000398
399/* Sets the currently active default encoding.
400
401 Returns 0 on success, -1 in case of an error.
402
403 */
404
405extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
406 const char *encoding /* Encoding name in standard form */
407 );
408
Guido van Rossumd8225182000-03-10 22:33:05 +0000409/* --- Generic Codecs ----------------------------------------------------- */
410
411/* Create a Unicode object by decoding the encoded string s of the
412 given size. */
413
414extern DL_IMPORT(PyObject*) PyUnicode_Decode(
415 const char *s, /* encoded string */
416 int size, /* size of buffer */
417 const char *encoding, /* encoding */
418 const char *errors /* error handling */
419 );
420
421/* Encodes a Py_UNICODE buffer of the given size and returns a
422 Python string object. */
423
424extern DL_IMPORT(PyObject*) PyUnicode_Encode(
425 const Py_UNICODE *s, /* Unicode char buffer */
426 int size, /* number of Py_UNICODE chars to encode */
427 const char *encoding, /* encoding */
428 const char *errors /* error handling */
429 );
430
431/* Encodes a Unicode object and returns the result as Python string
432 object. */
433
434extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
435 PyObject *unicode, /* Unicode object */
436 const char *encoding, /* encoding */
437 const char *errors /* error handling */
438 );
439
440/* --- UTF-8 Codecs ------------------------------------------------------- */
441
442extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
443 const char *string, /* UTF-8 encoded string */
444 int length, /* size of string */
445 const char *errors /* error handling */
446 );
447
448extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
449 PyObject *unicode /* Unicode object */
450 );
451
452extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
453 const Py_UNICODE *data, /* Unicode char buffer */
454 int length, /* number of Py_UNICODE chars to encode */
455 const char *errors /* error handling */
456 );
457
458/* --- UTF-16 Codecs ------------------------------------------------------ */
459
Guido van Rossum9e896b32000-04-05 20:11:21 +0000460/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000461 the corresponding Unicode object.
462
463 errors (if non-NULL) defines the error handling. It defaults
464 to "strict".
465
466 If byteorder is non-NULL, the decoder starts decoding using the
467 given byte order:
468
469 *byteorder == -1: little endian
470 *byteorder == 0: native order
471 *byteorder == 1: big endian
472
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000473 In native mode, the first two bytes of the stream are checked for a
474 BOM mark. If found, the BOM mark is analysed, the byte order
475 adjusted and the BOM skipped. In the other modes, no BOM mark
476 interpretation is done. After completion, *byteorder is set to the
477 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000478
479 If byteorder is NULL, the codec starts in native order mode.
480
481*/
482
483extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
484 const char *string, /* UTF-16 encoded string */
485 int length, /* size of string */
486 const char *errors, /* error handling */
487 int *byteorder /* pointer to byteorder to use
488 0=native;-1=LE,1=BE; updated on
489 exit */
490 );
491
492/* Returns a Python string using the UTF-16 encoding in native byte
493 order. The string always starts with a BOM mark. */
494
495extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
496 PyObject *unicode /* Unicode object */
497 );
498
499/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000500 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000501
502 If byteorder is not 0, output is written according to the following
503 byte order:
504
505 byteorder == -1: little endian
506 byteorder == 0: native byte order (writes a BOM mark)
507 byteorder == 1: big endian
508
509 If byteorder is 0, the output string will always start with the
510 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
511 prepended.
512
513 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
514 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000515 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000516
517*/
518
519extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
520 const Py_UNICODE *data, /* Unicode char buffer */
521 int length, /* number of Py_UNICODE chars to encode */
522 const char *errors, /* error handling */
523 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
524 );
525
526/* --- Unicode-Escape Codecs ---------------------------------------------- */
527
528extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
529 const char *string, /* Unicode-Escape encoded string */
530 int length, /* size of string */
531 const char *errors /* error handling */
532 );
533
534extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
535 PyObject *unicode /* Unicode object */
536 );
537
538extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
539 const Py_UNICODE *data, /* Unicode char buffer */
540 int length /* Number of Py_UNICODE chars to encode */
541 );
542
543/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
544
545extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
546 const char *string, /* Raw-Unicode-Escape encoded string */
547 int length, /* size of string */
548 const char *errors /* error handling */
549 );
550
551extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
552 PyObject *unicode /* Unicode object */
553 );
554
555extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
556 const Py_UNICODE *data, /* Unicode char buffer */
557 int length /* Number of Py_UNICODE chars to encode */
558 );
559
560/* --- Latin-1 Codecs -----------------------------------------------------
561
562 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
563
564*/
565
566extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
567 const char *string, /* Latin-1 encoded string */
568 int length, /* size of string */
569 const char *errors /* error handling */
570 );
571
572extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
573 PyObject *unicode /* Unicode object */
574 );
575
576extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
577 const Py_UNICODE *data, /* Unicode char buffer */
578 int length, /* Number of Py_UNICODE chars to encode */
579 const char *errors /* error handling */
580 );
581
582/* --- ASCII Codecs -------------------------------------------------------
583
   Only 7-bit ASCII data is accepted. All other codes generate errors.
585
586*/
587
588extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
589 const char *string, /* ASCII encoded string */
590 int length, /* size of string */
591 const char *errors /* error handling */
592 );
593
594extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
595 PyObject *unicode /* Unicode object */
596 );
597
598extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
599 const Py_UNICODE *data, /* Unicode char buffer */
600 int length, /* Number of Py_UNICODE chars to encode */
601 const char *errors /* error handling */
602 );
603
604/* --- Character Map Codecs -----------------------------------------------
605
606 This codec uses mappings to encode and decode characters.
607
608 Decoding mappings must map single string characters to single
609 Unicode characters, integers (which are then interpreted as Unicode
610 ordinals) or None (meaning "undefined mapping" and causing an
611 error).
612
613 Encoding mappings must map single Unicode characters to single
614 string characters, integers (which are then interpreted as Latin-1
615 ordinals) or None (meaning "undefined mapping" and causing an
616 error).
617
618 If a character lookup fails with a LookupError, the character is
619 copied as-is meaning that its ordinal value will be interpreted as
620 Unicode or Latin-1 ordinal resp. Because of this mappings only need
621 to contain those mappings which map characters to different code
622 points.
623
624*/
625
626extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
627 const char *string, /* Encoded string */
628 int length, /* size of string */
629 PyObject *mapping, /* character mapping
630 (char ordinal -> unicode ordinal) */
631 const char *errors /* error handling */
632 );
633
634extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
635 PyObject *unicode, /* Unicode object */
636 PyObject *mapping /* character mapping
637 (unicode ordinal -> char ordinal) */
638 );
639
640extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
641 const Py_UNICODE *data, /* Unicode char buffer */
642 int length, /* Number of Py_UNICODE chars to encode */
643 PyObject *mapping, /* character mapping
644 (unicode ordinal -> char ordinal) */
645 const char *errors /* error handling */
646 );
647
648/* Translate a Py_UNICODE buffer of the given length by applying a
649 character mapping table to it and return the resulting Unicode
650 object.
651
652 The mapping table must map Unicode ordinal integers to Unicode
653 ordinal integers or None (causing deletion of the character).
654
655 Mapping tables may be dictionaries or sequences. Unmapped character
656 ordinals (ones which cause a LookupError) are left untouched and
657 are copied as-is.
658
659*/
660
661extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
662 const Py_UNICODE *data, /* Unicode char buffer */
663 int length, /* Number of Py_UNICODE chars to encode */
664 PyObject *table, /* Translate table */
665 const char *errors /* error handling */
666 );
667
Guido van Rossumefec1152000-03-28 02:01:15 +0000668#ifdef MS_WIN32
Guido van Rossum24bdb042000-03-28 20:29:59 +0000669
Guido van Rossumefec1152000-03-28 02:01:15 +0000670/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000671
Guido van Rossumefec1152000-03-28 02:01:15 +0000672extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
673 const char *string, /* MBCS encoded string */
674 int length, /* size of string */
675 const char *errors /* error handling */
676 );
677
678extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
679 PyObject *unicode /* Unicode object */
680 );
681
682extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
683 const Py_UNICODE *data, /* Unicode char buffer */
684 int length, /* Number of Py_UNICODE chars to encode */
685 const char *errors /* error handling */
686 );
687
Guido van Rossumefec1152000-03-28 02:01:15 +0000688#endif /* MS_WIN32 */
Guido van Rossum24bdb042000-03-28 20:29:59 +0000689
Guido van Rossum9e896b32000-04-05 20:11:21 +0000690/* --- Decimal Encoder ---------------------------------------------------- */
691
692/* Takes a Unicode string holding a decimal value and writes it into
693 an output buffer using standard ASCII digit codes.
694
695 The output buffer has to provide at least length+1 bytes of storage
696 area. The output string is 0-terminated.
697
   The encoder converts whitespace to ' ', decimal characters to their
   corresponding ASCII digit and all other Latin-1 characters except
   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   are treated as errors. This includes embedded NUL characters.
702
703 Error handling is defined by the errors argument:
704
705 NULL or "strict": raise a ValueError
706 "ignore": ignore the wrong characters (these are not copied to the
707 output buffer)
708 "replace": replaces illegal characters with '?'
709
710 Returns 0 on success, -1 on failure.
711
712*/
713
714extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
715 Py_UNICODE *s, /* Unicode buffer */
716 int length, /* Number of Py_UNICODE chars to encode */
717 char *output, /* Output buffer; must have size >= length */
718 const char *errors /* error handling */
719 );
720
Guido van Rossumd8225182000-03-10 22:33:05 +0000721/* --- Methods & Slots ----------------------------------------------------
722
723 These are capable of handling Unicode objects and strings on input
724 (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
726
727/* Concat two strings giving a new Unicode string. */
728
729extern DL_IMPORT(PyObject*) PyUnicode_Concat(
730 PyObject *left, /* Left string */
731 PyObject *right /* Right string */
732 );
733
734/* Split a string giving a list of Unicode strings.
735
736 If sep is NULL, splitting will be done at all whitespace
737 substrings. Otherwise, splits occur at the given separator.
738
739 At most maxsplit splits will be done. If negative, no limit is set.
740
741 Separators are not included in the resulting list.
742
743*/
744
745extern DL_IMPORT(PyObject*) PyUnicode_Split(
746 PyObject *s, /* String to split */
747 PyObject *sep, /* String separator */
748 int maxsplit /* Maxsplit count */
749 );
750
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
755
756extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
757 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000758 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000759 );
760
761/* Translate a string by applying a character mapping table to it and
762 return the resulting Unicode object.
763
764 The mapping table must map Unicode ordinal integers to Unicode
765 ordinal integers or None (causing deletion of the character).
766
767 Mapping tables may be dictionaries or sequences. Unmapped character
768 ordinals (ones which cause a LookupError) are left untouched and
769 are copied as-is.
770
771*/
772
773extern DL_IMPORT(PyObject *) PyUnicode_Translate(
774 PyObject *str, /* String */
775 PyObject *table, /* Translate table */
776 const char *errors /* error handling */
777 );
778
779/* Join a sequence of strings using the given separator and return
780 the resulting Unicode string. */
781
782extern DL_IMPORT(PyObject*) PyUnicode_Join(
783 PyObject *separator, /* Separator string */
784 PyObject *seq /* Sequence object */
785 );
786
787/* Return 1 if substr matches str[start:end] at the given tail end, 0
788 otherwise. */
789
790extern DL_IMPORT(int) PyUnicode_Tailmatch(
791 PyObject *str, /* String */
792 PyObject *substr, /* Prefix or Suffix string */
793 int start, /* Start index */
794 int end, /* Stop index */
795 int direction /* Tail end: -1 prefix, +1 suffix */
796 );
797
798/* Return the first position of substr in str[start:end] using the
799 given search direction or -1 if not found. */
800
801extern DL_IMPORT(int) PyUnicode_Find(
802 PyObject *str, /* String */
803 PyObject *substr, /* Substring to find */
804 int start, /* Start index */
805 int end, /* Stop index */
806 int direction /* Find direction: +1 forward, -1 backward */
807 );
808
Barry Warsaw51ac5802000-03-20 16:36:48 +0000809/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000810
811extern DL_IMPORT(int) PyUnicode_Count(
812 PyObject *str, /* String */
813 PyObject *substr, /* Substring to count */
814 int start, /* Start index */
815 int end /* Stop index */
816 );
817
Barry Warsaw51ac5802000-03-20 16:36:48 +0000818/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000819 and return the resulting Unicode object. */
820
821extern DL_IMPORT(PyObject *) PyUnicode_Replace(
822 PyObject *str, /* String */
823 PyObject *substr, /* Substring to find */
824 PyObject *replstr, /* Substring to replace */
825 int maxcount /* Max. number of replacements to apply;
826 -1 = all */
827 );
828
829/* Compare two strings and return -1, 0, 1 for less than, equal,
830 greater than resp. */
831
832extern DL_IMPORT(int) PyUnicode_Compare(
833 PyObject *left, /* Left string */
834 PyObject *right /* Right string */
835 );
836
/* Apply an argument tuple or dictionary to a format string and return
   the resulting Unicode string. */
839
840extern DL_IMPORT(PyObject *) PyUnicode_Format(
841 PyObject *format, /* Format string */
842 PyObject *args /* Argument tuple or dictionary */
843 );
844
/* Checks whether element is contained in container and returns 1/0
   accordingly.

   element has to coerce to a one-element Unicode string. -1 is
   returned in case of an error. */
850
851extern DL_IMPORT(int) PyUnicode_Contains(
852 PyObject *container, /* Container string */
853 PyObject *element /* Element string */
854 );
855
Guido van Rossumd8225182000-03-10 22:33:05 +0000856/* === Characters Type APIs =============================================== */
857
858/* These should not be used directly. Use the Py_UNICODE_IS* and
859 Py_UNICODE_TO* macros instead.
860
861 These APIs are implemented in Objects/unicodectype.c.
862
863*/
864
865extern DL_IMPORT(int) _PyUnicode_IsLowercase(
866 register const Py_UNICODE ch /* Unicode character */
867 );
868
869extern DL_IMPORT(int) _PyUnicode_IsUppercase(
870 register const Py_UNICODE ch /* Unicode character */
871 );
872
873extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
874 register const Py_UNICODE ch /* Unicode character */
875 );
876
877extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
878 register const Py_UNICODE ch /* Unicode character */
879 );
880
881extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
882 register const Py_UNICODE ch /* Unicode character */
883 );
884
885extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
886 register const Py_UNICODE ch /* Unicode character */
887 );
888
889extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
890 register const Py_UNICODE ch /* Unicode character */
891 );
892
893extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
894 register const Py_UNICODE ch /* Unicode character */
895 );
896
897extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
898 register const Py_UNICODE ch /* Unicode character */
899 );
900
901extern DL_IMPORT(int) _PyUnicode_ToDigit(
902 register const Py_UNICODE ch /* Unicode character */
903 );
904
905extern DL_IMPORT(double) _PyUnicode_ToNumeric(
906 register const Py_UNICODE ch /* Unicode character */
907 );
908
909extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
910 register const Py_UNICODE ch /* Unicode character */
911 );
912
913extern DL_IMPORT(int) _PyUnicode_IsDigit(
914 register const Py_UNICODE ch /* Unicode character */
915 );
916
917extern DL_IMPORT(int) _PyUnicode_IsNumeric(
918 register const Py_UNICODE ch /* Unicode character */
919 );
920
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000921extern DL_IMPORT(int) _PyUnicode_IsAlpha(
922 register const Py_UNICODE ch /* Unicode character */
923 );
924
Guido van Rossumd8225182000-03-10 22:33:05 +0000925#ifdef __cplusplus
926}
927#endif
928#endif /* !Py_UNICODEOBJECT_H */