blob: f91a5a0c8c7f35e0b7fbd8bb3b83e437bd89084e [file] [log] [blame]
Guido van Rossumd8225182000-03-10 22:33:05 +00001#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
Guido van Rossumd8225182000-03-10 22:33:05 +00003
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd8225182000-03-10 22:33:05 +000011
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
/* === Internal API ======================================================= */

/* --- Internal Unicode Format -------------------------------------------- */

/* Set these flags if the platform has "wchar.h", "wctype.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* Defaults for various platforms: Windows (MS_WIN32) has a usable
   wchar_t type, so the flag is switched on there unless it has
   already been set. */
#if !defined(HAVE_USABLE_WCHAR_T) && defined(MS_WIN32)
# define HAVE_USABLE_WCHAR_T
#endif

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). */

/* A usable wchar_t implies that the wchar.h header is available. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
# define HAVE_WCHAR_H
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include <wchar.h>
#endif

/* Py_UNICODE is the storage unit of the raw Unicode buffer. */
#ifdef HAVE_USABLE_WCHAR_T

/* If the compiler defines wchar_t as a 16-bit unsigned type we can
   use the compiler type directly. Works fine with all modern Windows
   platforms. */

typedef wchar_t Py_UNICODE;

#else

/* Use if you have a standard ANSI compiler, without wchar_t support.
   If a short is not 16 bits on your platform, you have to fix the
   typedef below, or the module initialization code will complain. */

typedef unsigned short Py_UNICODE;

#endif

/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as single unsigned integer.
 *
 * unsigned int is used when the build configuration reports it as at
 * least 4 bytes wide (SIZEOF_INT).  In every other case unsigned long
 * is used: ISO C guarantees it holds at least 32 bits, so the type is
 * always defined, even when the SIZEOF_* macros are not available.
 */
#if defined(SIZEOF_INT) && SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif


/* --- Internal Unicode Operations ---------------------------------------- */

/* If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-ctype-functions.  This reduces the
   interpreter's code size. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

/* The C library is used wherever <wctype.h> offers an equivalent
   function; the remaining operations still map to the _PyUnicode_*()
   functions. */

#include <wctype.h>

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#else

/* Every operation maps to the matching _PyUnicode_*() function. */

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#endif

/* True if ch is alphabetic, a decimal, a digit or numeric.

   Note: ch is expanded once per test, so it must not have side
   effects. */
#define Py_UNICODE_ISALNUM(ch) \
       (Py_UNICODE_ISALPHA(ch) || \
        Py_UNICODE_ISDECIMAL(ch) || \
        Py_UNICODE_ISDIGIT(ch) || \
        Py_UNICODE_ISNUMERIC(ch))

/* Copy length Py_UNICODE elements from source to target using memcpy();
   the value of the expression is memcpy()'s return value. */
#define Py_UNICODE_COPY(target, source, length)\
    (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))

/* Store value into the first length elements of target.

   Each argument is evaluated exactly once and copied into a local
   whose name ends in an underscore, so that arguments with side
   effects behave as expected and an argument named "i" is not
   captured by the loop counter. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     int n_ = (length); int i_;\
     for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;}\
    while (0)

/* Non-zero if the characters of substring equal the characters of
   string starting at index offset.

   string and substring must point to objects with str and length
   members.  The first characters are compared before memcmp() is
   called.  No bounds checking is done, and the arguments are expanded
   more than once. */
#define Py_UNICODE_MATCH(string, offset, substring)\
    ((*((string)->str + (offset)) == *((substring)->str)) &&\
     !memcmp((string)->str + (offset), (substring)->str,\
             (substring)->length*sizeof(Py_UNICODE)))

Barry Warsaw51ac5802000-03-20 16:36:48 +0000198#ifdef __cplusplus
199extern "C" {
200#endif
201
Guido van Rossumd8225182000-03-10 22:33:05 +0000202/* --- Unicode Type ------------------------------------------------------- */
203
204typedef struct {
205 PyObject_HEAD
206 int length; /* Length of raw Unicode data in buffer */
207 Py_UNICODE *str; /* Raw Unicode buffer */
208 long hash; /* Hash value; -1 if not set */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000209 PyObject *defenc; /* (Default) Encoded version as Python
210 string, or NULL; this is used for
211 implementing the buffer protocol */
Guido van Rossumd8225182000-03-10 22:33:05 +0000212} PyUnicodeObject;
213
214extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
215
/* True if op's ob_type is exactly PyUnicode_Type (pointer comparison). */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)

/* Fast access macros.

   op is cast to PyUnicodeObject* without any type check.

   PyUnicode_GET_SIZE       length member (number of Py_UNICODE elements)
   PyUnicode_GET_DATA_SIZE  length member times sizeof(Py_UNICODE)
   PyUnicode_AS_UNICODE     str member (the raw Py_UNICODE buffer)
   PyUnicode_AS_DATA        str member cast to const char*  */
#define PyUnicode_GET_SIZE(op) \
        (((PyUnicodeObject *)(op))->length)
#define PyUnicode_GET_DATA_SIZE(op) \
        (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
#define PyUnicode_AS_UNICODE(op) \
        (((PyUnicodeObject *)(op))->str)
#define PyUnicode_AS_DATA(op) \
        ((const char *)((PyUnicodeObject *)(op))->str)

/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)

237/* === Public API ========================================================= */
238
239/* --- Plain Py_UNICODE --------------------------------------------------- */
240
241/* Create a Unicode Object from the Py_UNICODE buffer u of the given
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000242 size.
243
244 u may be NULL which causes the contents to be undefined. It is the
245 user's responsibility to fill in the needed data afterwards. Note
246 that modifying the Unicode object contents after construction is
247 only allowed if u was set to NULL.
Guido van Rossumd8225182000-03-10 22:33:05 +0000248
249 The buffer is copied into the new object. */
250
251extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
252 const Py_UNICODE *u, /* Unicode buffer */
253 int size /* size of buffer */
254 );
255
256/* Return a read-only pointer to the Unicode object's internal
257 Py_UNICODE buffer. */
258
259extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
260 PyObject *unicode /* Unicode object */
261 );
262
263/* Get the length of the Unicode object. */
264
265extern DL_IMPORT(int) PyUnicode_GetSize(
266 PyObject *unicode /* Unicode object */
267 );
268
Guido van Rossum52c23592000-04-10 13:41:41 +0000269/* Resize an already allocated Unicode object to the new size length.
270
271 *unicode is modified to point to the new (resized) object and 0
272 returned on success.
273
274 This API may only be called by the function which also called the
275 Unicode constructor. The refcount on the object must be 1. Otherwise,
276 an error is returned.
277
278 Error handling is implemented as follows: an exception is set, -1
279 is returned and *unicode left untouched.
280
281*/
282
283extern DL_IMPORT(int) PyUnicode_Resize(
284 PyObject **unicode, /* Pointer to the Unicode object */
285 int length /* New length */
286 );
287
Guido van Rossumd8225182000-03-10 22:33:05 +0000288/* Coerce obj to an Unicode object and return a reference with
289 *incremented* refcount.
290
291 Coercion is done in the following way:
292
293 1. Unicode objects are passed back as-is with incremented
294 refcount.
295
296 2. String and other char buffer compatible objects are decoded
Fred Drakecb093fe2000-05-09 19:51:53 +0000297 under the assumptions that they contain data using the current
298 default encoding. Decoding is done in "strict" mode.
Guido van Rossumd8225182000-03-10 22:33:05 +0000299
300 3. All other objects raise an exception.
301
302 The API returns NULL in case of an error. The caller is responsible
303 for decref'ing the returned objects.
304
305*/
306
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000307extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
308 register PyObject *obj, /* Object */
309 const char *encoding, /* encoding */
310 const char *errors /* error handling */
311 );
312
313/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
314 which results in using the default encoding as basis for
315 decoding the object.
316
317 Coerces obj to an Unicode object and return a reference with
318 *incremented* refcount.
319
320 The API returns NULL in case of an error. The caller is responsible
321 for decref'ing the returned objects.
322
323*/
324
Guido van Rossumd8225182000-03-10 22:33:05 +0000325extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
326 register PyObject *obj /* Object */
327 );
328
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    int size                    /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w. At
   most size wchar_t characters are copied.

   Returns the number of wchar_t characters copied or -1 in case of an
   error. */

extern DL_IMPORT(int) PyUnicode_AsWideChar(
    PyUnicodeObject *unicode,   /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    int size                    /* size of buffer */
    );

#endif

/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments, encoding and errors. These
   parameters have the same semantics as the ones of the builtin
   unicode() API.

   Setting encoding to NULL causes the default encoding to be used.

   Error handling is set by errors, which may also be set to NULL,
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/

Fred Drakecb093fe2000-05-09 19:51:53 +0000375/* --- Manage the default encoding ---------------------------------------- */
376
377/* Returns the currently active default encoding.
378
379 The default encoding is currently implemented as run-time settable
380 process global. This may change in future versions of the
381 interpreter to become a parameter which is managed on a per-thread
382 basis.
383
384 */
385
Thomas Wouters5f375912000-07-22 23:30:03 +0000386extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
Fred Drakecb093fe2000-05-09 19:51:53 +0000387
388/* Sets the currently active default encoding.
389
390 Returns 0 on success, -1 in case of an error.
391
392 */
393
394extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
395 const char *encoding /* Encoding name in standard form */
396 );
397
Guido van Rossumd8225182000-03-10 22:33:05 +0000398/* --- Generic Codecs ----------------------------------------------------- */
399
400/* Create a Unicode object by decoding the encoded string s of the
401 given size. */
402
403extern DL_IMPORT(PyObject*) PyUnicode_Decode(
404 const char *s, /* encoded string */
405 int size, /* size of buffer */
406 const char *encoding, /* encoding */
407 const char *errors /* error handling */
408 );
409
410/* Encodes a Py_UNICODE buffer of the given size and returns a
411 Python string object. */
412
413extern DL_IMPORT(PyObject*) PyUnicode_Encode(
414 const Py_UNICODE *s, /* Unicode char buffer */
415 int size, /* number of Py_UNICODE chars to encode */
416 const char *encoding, /* encoding */
417 const char *errors /* error handling */
418 );
419
420/* Encodes a Unicode object and returns the result as Python string
421 object. */
422
423extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
424 PyObject *unicode, /* Unicode object */
425 const char *encoding, /* encoding */
426 const char *errors /* error handling */
427 );
428
429/* --- UTF-8 Codecs ------------------------------------------------------- */
430
431extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
432 const char *string, /* UTF-8 encoded string */
433 int length, /* size of string */
434 const char *errors /* error handling */
435 );
436
437extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
438 PyObject *unicode /* Unicode object */
439 );
440
441extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
442 const Py_UNICODE *data, /* Unicode char buffer */
443 int length, /* number of Py_UNICODE chars to encode */
444 const char *errors /* error handling */
445 );
446
447/* --- UTF-16 Codecs ------------------------------------------------------ */
448
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449/* Decodes length bytes from a UTF-16 encoded buffer string and returns
Guido van Rossumd8225182000-03-10 22:33:05 +0000450 the corresponding Unicode object.
451
452 errors (if non-NULL) defines the error handling. It defaults
453 to "strict".
454
455 If byteorder is non-NULL, the decoder starts decoding using the
456 given byte order:
457
458 *byteorder == -1: little endian
459 *byteorder == 0: native order
460 *byteorder == 1: big endian
461
Marc-André Lemburg489b56e2001-05-21 20:30:15 +0000462 In native mode, the first two bytes of the stream are checked for a
463 BOM mark. If found, the BOM mark is analysed, the byte order
464 adjusted and the BOM skipped. In the other modes, no BOM mark
465 interpretation is done. After completion, *byteorder is set to the
466 current byte order at the end of input data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000467
468 If byteorder is NULL, the codec starts in native order mode.
469
470*/
471
472extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
473 const char *string, /* UTF-16 encoded string */
474 int length, /* size of string */
475 const char *errors, /* error handling */
476 int *byteorder /* pointer to byteorder to use
477 0=native;-1=LE,1=BE; updated on
478 exit */
479 );
480
481/* Returns a Python string using the UTF-16 encoding in native byte
482 order. The string always starts with a BOM mark. */
483
484extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
485 PyObject *unicode /* Unicode object */
486 );
487
488/* Returns a Python string object holding the UTF-16 encoded value of
Guido van Rossum9e896b32000-04-05 20:11:21 +0000489 the Unicode data.
Guido van Rossumd8225182000-03-10 22:33:05 +0000490
491 If byteorder is not 0, output is written according to the following
492 byte order:
493
494 byteorder == -1: little endian
495 byteorder == 0: native byte order (writes a BOM mark)
496 byteorder == 1: big endian
497
498 If byteorder is 0, the output string will always start with the
499 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
500 prepended.
501
502 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
503 UCS-2. This trick makes it possible to add full UTF-16 capabilities
Thomas Wouters7e474022000-07-16 12:04:32 +0000504 at a later point without compromising the APIs.
Guido van Rossumd8225182000-03-10 22:33:05 +0000505
506*/
507
508extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
509 const Py_UNICODE *data, /* Unicode char buffer */
510 int length, /* number of Py_UNICODE chars to encode */
511 const char *errors, /* error handling */
512 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
513 );
514
515/* --- Unicode-Escape Codecs ---------------------------------------------- */
516
517extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
518 const char *string, /* Unicode-Escape encoded string */
519 int length, /* size of string */
520 const char *errors /* error handling */
521 );
522
523extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
524 PyObject *unicode /* Unicode object */
525 );
526
527extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
528 const Py_UNICODE *data, /* Unicode char buffer */
529 int length /* Number of Py_UNICODE chars to encode */
530 );
531
532/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
533
534extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
535 const char *string, /* Raw-Unicode-Escape encoded string */
536 int length, /* size of string */
537 const char *errors /* error handling */
538 );
539
540extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
541 PyObject *unicode /* Unicode object */
542 );
543
544extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
545 const Py_UNICODE *data, /* Unicode char buffer */
546 int length /* Number of Py_UNICODE chars to encode */
547 );
548
549/* --- Latin-1 Codecs -----------------------------------------------------
550
551 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
552
553*/
554
555extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
556 const char *string, /* Latin-1 encoded string */
557 int length, /* size of string */
558 const char *errors /* error handling */
559 );
560
561extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
562 PyObject *unicode /* Unicode object */
563 );
564
565extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
566 const Py_UNICODE *data, /* Unicode char buffer */
567 int length, /* Number of Py_UNICODE chars to encode */
568 const char *errors /* error handling */
569 );
570
571/* --- ASCII Codecs -------------------------------------------------------
572
573 Only 7-bit ASCII data is excepted. All other codes generate errors.
574
575*/
576
577extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
578 const char *string, /* ASCII encoded string */
579 int length, /* size of string */
580 const char *errors /* error handling */
581 );
582
583extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
584 PyObject *unicode /* Unicode object */
585 );
586
587extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
588 const Py_UNICODE *data, /* Unicode char buffer */
589 int length, /* Number of Py_UNICODE chars to encode */
590 const char *errors /* error handling */
591 );
592
593/* --- Character Map Codecs -----------------------------------------------
594
595 This codec uses mappings to encode and decode characters.
596
597 Decoding mappings must map single string characters to single
598 Unicode characters, integers (which are then interpreted as Unicode
599 ordinals) or None (meaning "undefined mapping" and causing an
600 error).
601
602 Encoding mappings must map single Unicode characters to single
603 string characters, integers (which are then interpreted as Latin-1
604 ordinals) or None (meaning "undefined mapping" and causing an
605 error).
606
607 If a character lookup fails with a LookupError, the character is
608 copied as-is meaning that its ordinal value will be interpreted as
609 Unicode or Latin-1 ordinal resp. Because of this mappings only need
610 to contain those mappings which map characters to different code
611 points.
612
613*/
614
615extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
616 const char *string, /* Encoded string */
617 int length, /* size of string */
618 PyObject *mapping, /* character mapping
619 (char ordinal -> unicode ordinal) */
620 const char *errors /* error handling */
621 );
622
623extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
624 PyObject *unicode, /* Unicode object */
625 PyObject *mapping /* character mapping
626 (unicode ordinal -> char ordinal) */
627 );
628
629extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
630 const Py_UNICODE *data, /* Unicode char buffer */
631 int length, /* Number of Py_UNICODE chars to encode */
632 PyObject *mapping, /* character mapping
633 (unicode ordinal -> char ordinal) */
634 const char *errors /* error handling */
635 );
636
637/* Translate a Py_UNICODE buffer of the given length by applying a
638 character mapping table to it and return the resulting Unicode
639 object.
640
641 The mapping table must map Unicode ordinal integers to Unicode
642 ordinal integers or None (causing deletion of the character).
643
644 Mapping tables may be dictionaries or sequences. Unmapped character
645 ordinals (ones which cause a LookupError) are left untouched and
646 are copied as-is.
647
648*/
649
650extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
651 const Py_UNICODE *data, /* Unicode char buffer */
652 int length, /* Number of Py_UNICODE chars to encode */
653 PyObject *table, /* Translate table */
654 const char *errors /* error handling */
655 );
656
#ifdef MS_WIN32

/* --- MBCS codecs for Windows -------------------------------------------- */

/* Decode an MBCS encoded string of the given length. */
extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
    int length,                 /* size of string */
    const char *errors          /* error handling */
    );

/* MBCS encode a Unicode object. */
extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

/* MBCS encode a Py_UNICODE buffer of the given length. */
extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
    const Py_UNICODE *data,     /* Unicode char buffer */
    int length,                 /* Number of Py_UNICODE chars to encode */
    const char *errors          /* error handling */
    );

#endif /* MS_WIN32 */

Guido van Rossum9e896b32000-04-05 20:11:21 +0000679/* --- Decimal Encoder ---------------------------------------------------- */
680
681/* Takes a Unicode string holding a decimal value and writes it into
682 an output buffer using standard ASCII digit codes.
683
684 The output buffer has to provide at least length+1 bytes of storage
685 area. The output string is 0-terminated.
686
687 The encoder converts whitespace to ' ', decimal characters to their
688 corresponding ASCII digit and all other Latin-1 characters except
689 \0 as-is. Characters outside this range (Unicode ordinals 1-256)
690 are treated as errors. This includes embedded NULL bytes.
691
692 Error handling is defined by the errors argument:
693
694 NULL or "strict": raise a ValueError
695 "ignore": ignore the wrong characters (these are not copied to the
696 output buffer)
697 "replace": replaces illegal characters with '?'
698
699 Returns 0 on success, -1 on failure.
700
701*/
702
703extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
704 Py_UNICODE *s, /* Unicode buffer */
705 int length, /* Number of Py_UNICODE chars to encode */
706 char *output, /* Output buffer; must have size >= length */
707 const char *errors /* error handling */
708 );
709
Guido van Rossumd8225182000-03-10 22:33:05 +0000710/* --- Methods & Slots ----------------------------------------------------
711
712 These are capable of handling Unicode objects and strings on input
713 (we refer to them as strings in the descriptions) and return
714 Unicode objects or integers as apporpriate. */
715
716/* Concat two strings giving a new Unicode string. */
717
718extern DL_IMPORT(PyObject*) PyUnicode_Concat(
719 PyObject *left, /* Left string */
720 PyObject *right /* Right string */
721 );
722
723/* Split a string giving a list of Unicode strings.
724
725 If sep is NULL, splitting will be done at all whitespace
726 substrings. Otherwise, splits occur at the given separator.
727
728 At most maxsplit splits will be done. If negative, no limit is set.
729
730 Separators are not included in the resulting list.
731
732*/
733
734extern DL_IMPORT(PyObject*) PyUnicode_Split(
735 PyObject *s, /* String to split */
736 PyObject *sep, /* String separator */
737 int maxsplit /* Maxsplit count */
738 );
739
740/* Dito, but split at line breaks.
741
742 CRLF is considered to be one line break. Line breaks are not
743 included in the resulting list. */
744
745extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
746 PyObject *s, /* String to split */
Guido van Rossum004d64f2000-04-11 15:39:46 +0000747 int keepends /* If true, line end markers are included */
Guido van Rossumd8225182000-03-10 22:33:05 +0000748 );
749
750/* Translate a string by applying a character mapping table to it and
751 return the resulting Unicode object.
752
753 The mapping table must map Unicode ordinal integers to Unicode
754 ordinal integers or None (causing deletion of the character).
755
756 Mapping tables may be dictionaries or sequences. Unmapped character
757 ordinals (ones which cause a LookupError) are left untouched and
758 are copied as-is.
759
760*/
761
762extern DL_IMPORT(PyObject *) PyUnicode_Translate(
763 PyObject *str, /* String */
764 PyObject *table, /* Translate table */
765 const char *errors /* error handling */
766 );
767
768/* Join a sequence of strings using the given separator and return
769 the resulting Unicode string. */
770
771extern DL_IMPORT(PyObject*) PyUnicode_Join(
772 PyObject *separator, /* Separator string */
773 PyObject *seq /* Sequence object */
774 );
775
776/* Return 1 if substr matches str[start:end] at the given tail end, 0
777 otherwise. */
778
779extern DL_IMPORT(int) PyUnicode_Tailmatch(
780 PyObject *str, /* String */
781 PyObject *substr, /* Prefix or Suffix string */
782 int start, /* Start index */
783 int end, /* Stop index */
784 int direction /* Tail end: -1 prefix, +1 suffix */
785 );
786
787/* Return the first position of substr in str[start:end] using the
788 given search direction or -1 if not found. */
789
790extern DL_IMPORT(int) PyUnicode_Find(
791 PyObject *str, /* String */
792 PyObject *substr, /* Substring to find */
793 int start, /* Start index */
794 int end, /* Stop index */
795 int direction /* Find direction: +1 forward, -1 backward */
796 );
797
Barry Warsaw51ac5802000-03-20 16:36:48 +0000798/* Count the number of occurrences of substr in str[start:end]. */
Guido van Rossumd8225182000-03-10 22:33:05 +0000799
800extern DL_IMPORT(int) PyUnicode_Count(
801 PyObject *str, /* String */
802 PyObject *substr, /* Substring to count */
803 int start, /* Start index */
804 int end /* Stop index */
805 );
806
Barry Warsaw51ac5802000-03-20 16:36:48 +0000807/* Replace at most maxcount occurrences of substr in str with replstr
Guido van Rossumd8225182000-03-10 22:33:05 +0000808 and return the resulting Unicode object. */
809
810extern DL_IMPORT(PyObject *) PyUnicode_Replace(
811 PyObject *str, /* String */
812 PyObject *substr, /* Substring to find */
813 PyObject *replstr, /* Substring to replace */
814 int maxcount /* Max. number of replacements to apply;
815 -1 = all */
816 );
817
818/* Compare two strings and return -1, 0, 1 for less than, equal,
819 greater than resp. */
820
821extern DL_IMPORT(int) PyUnicode_Compare(
822 PyObject *left, /* Left string */
823 PyObject *right /* Right string */
824 );
825
Thomas Wouters7e474022000-07-16 12:04:32 +0000826/* Apply a argument tuple or dictionary to a format string and return
Guido van Rossumd8225182000-03-10 22:33:05 +0000827 the resulting Unicode string. */
828
829extern DL_IMPORT(PyObject *) PyUnicode_Format(
830 PyObject *format, /* Format string */
831 PyObject *args /* Argument tuple or dictionary */
832 );
833
Guido van Rossumd0d366b2000-03-13 23:22:24 +0000834/* Checks whether element is contained in container and return 1/0
835 accordingly.
836
837 element has to coerce to an one element Unicode string. -1 is
838 returned in case of an error. */
839
840extern DL_IMPORT(int) PyUnicode_Contains(
841 PyObject *container, /* Container string */
842 PyObject *element /* Element string */
843 );
844
Guido van Rossumd8225182000-03-10 22:33:05 +0000845/* === Characters Type APIs =============================================== */
846
847/* These should not be used directly. Use the Py_UNICODE_IS* and
848 Py_UNICODE_TO* macros instead.
849
850 These APIs are implemented in Objects/unicodectype.c.
851
852*/
853
854extern DL_IMPORT(int) _PyUnicode_IsLowercase(
855 register const Py_UNICODE ch /* Unicode character */
856 );
857
858extern DL_IMPORT(int) _PyUnicode_IsUppercase(
859 register const Py_UNICODE ch /* Unicode character */
860 );
861
862extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
863 register const Py_UNICODE ch /* Unicode character */
864 );
865
866extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
867 register const Py_UNICODE ch /* Unicode character */
868 );
869
870extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
871 register const Py_UNICODE ch /* Unicode character */
872 );
873
874extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
875 register const Py_UNICODE ch /* Unicode character */
876 );
877
878extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
879 register const Py_UNICODE ch /* Unicode character */
880 );
881
882extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
883 register const Py_UNICODE ch /* Unicode character */
884 );
885
886extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
887 register const Py_UNICODE ch /* Unicode character */
888 );
889
890extern DL_IMPORT(int) _PyUnicode_ToDigit(
891 register const Py_UNICODE ch /* Unicode character */
892 );
893
894extern DL_IMPORT(double) _PyUnicode_ToNumeric(
895 register const Py_UNICODE ch /* Unicode character */
896 );
897
898extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
899 register const Py_UNICODE ch /* Unicode character */
900 );
901
902extern DL_IMPORT(int) _PyUnicode_IsDigit(
903 register const Py_UNICODE ch /* Unicode character */
904 );
905
906extern DL_IMPORT(int) _PyUnicode_IsNumeric(
907 register const Py_UNICODE ch /* Unicode character */
908 );
909
Marc-André Lemburgf03e7412000-07-05 09:45:59 +0000910extern DL_IMPORT(int) _PyUnicode_IsAlpha(
911 register const Py_UNICODE ch /* Unicode character */
912 );
913
Guido van Rossumd8225182000-03-10 22:33:05 +0000914#ifdef __cplusplus
915}
916#endif
917#endif /* !Py_UNICODEOBJECT_H */