/* blob: fc1760e3831d60411164bea4eaf855b60263b735 */
/*
 * encoding.h : interface for the encoding conversion functions needed for
 *              XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */

21#ifndef __XML_CHAR_ENCODING_H__
22#define __XML_CHAR_ENCODING_H__
23
24#include <libxml/xmlversion.h>
25#ifdef LIBXML_ICONV_ENABLED
26#include <iconv.h>
27#endif
28#include <libxml/tree.h>
29
30#ifdef __cplusplus
31extern "C" {
32#endif
33
/**
 * xmlCharEncoding:
 *
 * Predefined values for some standard encodings.
 * Libxml does not do beforehand translation on UTF8 and ISOLatinX.
 * It also supports UTF16 (LE and BE) by default.
 *
 * Anything else would have to be translated to UTF8 before being
 * given to the parser itself. The BOM for UTF16 and the encoding
 * declaration are looked at and a converter is looked for at that
 * point. If not found the parser stops here as asked by the XML REC.
 * A converter can be registered by the user using
 * xmlRegisterCharEncodingHandler, but the current form doesn't allow
 * stateful transcoding (a serious problem, agreed!). If iconv has been
 * found it will be used automatically and allows stateful transcoding;
 * the simplest is then to be sure to enable iconv and to provide iconv
 * libs for the encoding support needed.
 *
 * The numeric values are spelled out explicitly so that they stay fixed
 * if enumerators are ever reordered.
 */
typedef enum {
    /*
     * NOTE(review): ERROR and NONE carry the same description in the
     * original; presumably ERROR is the failure value -- confirm.
     */
    XML_CHAR_ENCODING_ERROR     = -1, /* No char encoding detected */
    XML_CHAR_ENCODING_NONE      =  0, /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8      =  1, /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE   =  2, /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE   =  3, /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE    =  4, /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE    =  5, /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC    =  6, /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143 =  7, /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412 =  8, /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2      =  9, /* UCS-2 */
    XML_CHAR_ENCODING_8859_1    = 10, /* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2    = 11, /* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3    = 12, /* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4    = 13, /* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5    = 14, /* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6    = 15, /* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7    = 16, /* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8    = 17, /* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9    = 18, /* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP   = 19, /* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS = 20, /* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP    = 21, /* EUC-JP */
    XML_CHAR_ENCODING_ASCII     = 22  /* pure ASCII */
} xmlCharEncoding;

/**
 * xmlCharEncodingInputFunc:
 * @out:  a pointer to an array of bytes to store the UTF-8 result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of chars in the original encoding
 * @inlen:  the length of @in
 *
 * Take a block of chars in the original encoding and try to convert
 * it to an UTF-8 block of chars out.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets consumed.
 */
typedef int (*xmlCharEncodingInputFunc)(unsigned char *out, int *outlen,
                                        const unsigned char *in, int *inlen);


/**
 * xmlCharEncodingOutputFunc:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to another
 * encoding.
 * Note: a first call designed to produce heading info is called with
 * in = NULL. If stateful this should also initialize the encoder state.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets consumed.
 */
typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
                                         const unsigned char *in, int *inlen);


121/*
122 * Block defining the handlers for non UTF-8 encodings.
123 * If iconv is supported, there is two extra fields
124 */
125
126typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
127typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
128struct _xmlCharEncodingHandler {
129 char *name;
130 xmlCharEncodingInputFunc input;
131 xmlCharEncodingOutputFunc output;
132#ifdef LIBXML_ICONV_ENABLED
133 iconv_t iconv_in;
134 iconv_t iconv_out;
135#endif /* LIBXML_ICONV_ENABLED */
136};
137
138/*
139 * Interfaces for encoding handlers
140 */
141void xmlInitCharEncodingHandlers (void);
142void xmlCleanupCharEncodingHandlers (void);
143void xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler);
144xmlCharEncodingHandlerPtr
145 xmlGetCharEncodingHandler (xmlCharEncoding enc);
146xmlCharEncodingHandlerPtr
147 xmlFindCharEncodingHandler (const char *name);
148
149
150/*
151 * Interfaces for encoding names and aliases
152 */
153int xmlAddEncodingAlias (const char *name,
154 const char *alias);
155int xmlDelEncodingAlias (const char *alias);
156const char *
157 xmlGetEncodingAlias (const char *alias);
158void xmlCleanupEncodingAliases (void);
159xmlCharEncoding
160 xmlParseCharEncoding (const char* name);
161const char*
162 xmlGetCharEncodingName (xmlCharEncoding enc);
163
164/*
165 * Interfaces directly used by the parsers.
166 */
167xmlCharEncoding
168 xmlDetectCharEncoding (const unsigned char* in,
169 int len);
170
Owen Taylor3473f882001-02-23 17:55:21 +0000171int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
172 xmlBufferPtr out,
173 xmlBufferPtr in);
174
175int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
176 xmlBufferPtr out,
177 xmlBufferPtr in);
178int xmlCharEncFirstLine (xmlCharEncodingHandler *handler,
179 xmlBufferPtr out,
180 xmlBufferPtr in);
181int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
182
/*
 * Export a few useful functions.
 *
 * Both prototypes take the same parameter list as the
 * xmlCharEncodingInputFunc / xmlCharEncodingOutputFunc typedefs.
 * NOTE(review): judging by the names they convert between UTF-8 and
 * ISO Latin-1 -- confirm against the implementation.
 */
int     UTF8Toisolat1                   (unsigned char *out,
                                         int *outlen,
                                         const unsigned char *in,
                                         int *inlen);
int     isolat1ToUTF8                   (unsigned char *out,
                                         int *outlen,
                                         const unsigned char *in,
                                         int *inlen);
Daniel Veillard97ac1312001-05-30 19:14:17 +0000194/*
195 * exports additional "UTF-8 aware" string routines which are
196 */
197
Daniel Veillarde043ee12001-04-16 14:08:07 +0000198int xmlCheckUTF8 (const unsigned char *utf);
Daniel Veillard97ac1312001-05-30 19:14:17 +0000199
200int xmlUTF8Strsize (const xmlChar *utf,
201 int len);
202xmlChar * xmlUTF8Strndup (const xmlChar *utf,
203 int len);
204xmlChar * xmlUTF8Strpos (const xmlChar *utf,
205 int pos);
206int xmlUTF8Strloc (const xmlChar *utf,
207 const xmlChar *utfchar);
208xmlChar * xmlUTF8Strsub (const xmlChar *utf,
209 int start,
210 int len);
211
212int xmlUTF8Strlen (const xmlChar *utf);
Daniel Veillarde043ee12001-04-16 14:08:07 +0000213
Owen Taylor3473f882001-02-23 17:55:21 +0000214#ifdef __cplusplus
215}
216#endif
217
218#endif /* __XML_CHAR_ENCODING_H__ */