blob: b98f2a33e259bdcd87b0e6b7e62ec6e2f5ad7f70 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * parser.h : Interfaces, constants and types related to the XML parser.
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifndef __XML_PARSER_H__
10#define __XML_PARSER_H__
11
12#include <libxml/tree.h>
13#include <libxml/valid.h>
14#include <libxml/xmlIO.h>
15#include <libxml/entities.h>
16
17
18#ifdef __cplusplus
19extern "C" {
20#endif
21
/*
 * Constants: the XML version string named as the default.
 */
#define XML_DEFAULT_VERSION "1.0"
26
27/**
28 * an xmlParserInput is an input flow for the XML processor.
29 * Each entity parsed is associated an xmlParserInput (except the
30 * few predefined ones). This is the case both for internal entities
31 * - in which case the flow is already completely in memory - or
32 * external entities - in which case we use the buf structure for
33 * progressive reading and I18N conversions to the internal UTF-8 format.
34 */
35
36typedef void (* xmlParserInputDeallocate)(xmlChar *);
37typedef struct _xmlParserInput xmlParserInput;
38typedef xmlParserInput *xmlParserInputPtr;
39struct _xmlParserInput {
40 /* Input buffer */
41 xmlParserInputBufferPtr buf; /* UTF-8 encoded buffer */
42
43 const char *filename; /* The file analyzed, if any */
44 const char *directory; /* the directory/base of teh file */
45 const xmlChar *base; /* Base of the array to parse */
46 const xmlChar *cur; /* Current char being parsed */
47 int length; /* length if known */
48 int line; /* Current line */
49 int col; /* Current column */
50 int consumed; /* How many xmlChars already consumed */
51 xmlParserInputDeallocate free; /* function to deallocate the base */
52 const xmlChar *encoding; /* the encoding string for entity */
53 const xmlChar *version; /* the version string for entity */
54 int standalone; /* Was that entity marked standalone */
55};
56
/**
 * The parser can be asked to collect node information, i.e. the place
 * in the file at which each node was detected.
 * NOTE: this is off by default and not very well tested.
 */
typedef struct _xmlParserNodeInfo xmlParserNodeInfo;
typedef xmlParserNodeInfo *xmlParserNodeInfoPtr;

struct _xmlParserNodeInfo {
    const struct _xmlNode *node;    /* the node this record describes */

    /* Position and line number at which the text that created the node
     * begins and ends. */
    unsigned long begin_pos;
    unsigned long begin_line;
    unsigned long end_pos;
    unsigned long end_line;
};
73
74typedef struct _xmlParserNodeInfoSeq xmlParserNodeInfoSeq;
75typedef xmlParserNodeInfoSeq *xmlParserNodeInfoSeqPtr;
76struct _xmlParserNodeInfoSeq {
77 unsigned long maximum;
78 unsigned long length;
79 xmlParserNodeInfo* buffer;
80};
81
/**
 * The parser now also works as a state based parser.
 * The recursive one uses the state info for entities processing.
 *
 * Every enumerator value is spelled out; the values are the ones implied
 * by the declaration order starting from XML_PARSER_START = 0.
 */
typedef enum {
    XML_PARSER_EOF             = -1,  /* nothing is to be parsed */
    XML_PARSER_START           =  0,  /* nothing has been parsed */
    XML_PARSER_MISC            =  1,  /* Misc* before the internal subset */
    XML_PARSER_PI              =  2,  /* within a processing instruction */
    XML_PARSER_DTD             =  3,  /* within some DTD content */
    XML_PARSER_PROLOG          =  4,  /* Misc* after the internal subset */
    XML_PARSER_COMMENT         =  5,  /* within a comment */
    XML_PARSER_START_TAG       =  6,  /* within a start tag */
    XML_PARSER_CONTENT         =  7,  /* within the content */
    XML_PARSER_CDATA_SECTION   =  8,  /* within a CDATA section */
    XML_PARSER_END_TAG         =  9,  /* within a closing tag */
    XML_PARSER_ENTITY_DECL     = 10,  /* within an entity declaration */
    XML_PARSER_ENTITY_VALUE    = 11,  /* within an entity value in a decl */
    XML_PARSER_ATTRIBUTE_VALUE = 12,  /* within an attribute value */
    XML_PARSER_SYSTEM_LITERAL  = 13,  /* within a SYSTEM value */
    XML_PARSER_EPILOG          = 14,  /* the Misc* after the last end tag */
    XML_PARSER_IGNORE          = 15   /* within an IGNORED section */
} xmlParserInputState;
105
106/**
107 * The parser context.
108 * NOTE This doesn't completely defines the parser state, the (current ?)
109 * design of the parser uses recursive function calls since this allow
110 * and easy mapping from the production rules of the specification
111 * to the actual code. The drawback is that the actual function call
112 * also reflect the parser state. However most of the parsing routines
113 * takes as the only argument the parser context pointer, so migrating
114 * to a state based parser for progressive parsing shouldn't be too hard.
115 */
116typedef struct _xmlParserCtxt xmlParserCtxt;
117typedef xmlParserCtxt *xmlParserCtxtPtr;
118struct _xmlParserCtxt {
119 struct _xmlSAXHandler *sax; /* The SAX handler */
120 void *userData; /* For SAX interface only, used by DOM build */
121 xmlDocPtr myDoc; /* the document being built */
122 int wellFormed; /* is the document well formed */
123 int replaceEntities; /* shall we replace entities ? */
124 const xmlChar *version; /* the XML version string */
125 const xmlChar *encoding; /* the declared encoding, if any */
126 int standalone; /* standalone document */
127 int html; /* an HTML(1)/Docbook(2) document */
128
129 /* Input stream stack */
130 xmlParserInputPtr input; /* Current input stream */
131 int inputNr; /* Number of current input streams */
132 int inputMax; /* Max number of input streams */
133 xmlParserInputPtr *inputTab; /* stack of inputs */
134
135 /* Node analysis stack only used for DOM building */
136 xmlNodePtr node; /* Current parsed Node */
137 int nodeNr; /* Depth of the parsing stack */
138 int nodeMax; /* Max depth of the parsing stack */
139 xmlNodePtr *nodeTab; /* array of nodes */
140
141 int record_info; /* Whether node info should be kept */
142 xmlParserNodeInfoSeq node_seq; /* info about each node parsed */
143
144 int errNo; /* error code */
145
146 int hasExternalSubset; /* reference and external subset */
147 int hasPErefs; /* the internal subset has PE refs */
148 int external; /* are we parsing an external entity */
149
150 int valid; /* is the document valid */
151 int validate; /* shall we try to validate ? */
152 xmlValidCtxt vctxt; /* The validity context */
153
154 xmlParserInputState instate; /* current type of input */
155 int token; /* next char look-ahead */
156
157 char *directory; /* the data directory */
158
159 /* Node name stack */
160 xmlChar *name; /* Current parsed Node */
161 int nameNr; /* Depth of the parsing stack */
162 int nameMax; /* Max depth of the parsing stack */
163 xmlChar * *nameTab; /* array of nodes */
164
165 long nbChars; /* number of xmlChar processed */
166 long checkIndex; /* used by progressive parsing lookup */
167 int keepBlanks; /* ugly but ... */
168 int disableSAX; /* SAX callbacks are disabled */
169 int inSubset; /* Parsing is in int 1/ext 2 subset */
170 xmlChar * intSubName; /* name of subset */
171 xmlChar * extSubURI; /* URI of external subset */
172 xmlChar * extSubSystem; /* SYSTEM ID of external subset */
173
174 /* xml:space values */
175 int * space; /* Should the parser preserve spaces */
176 int spaceNr; /* Depth of the parsing stack */
177 int spaceMax; /* Max depth of the parsing stack */
178 int * spaceTab; /* array of space infos */
179
180 int depth; /* to prevent entity substitution loops */
181 xmlParserInputPtr entity; /* used to check entities boundaries */
182 int charset; /* encoding of the in-memory content
183 actually an xmlCharEncoding */
184 int nodelen; /* Those two fields are there to */
185 int nodemem; /* Speed up large node parsing */
186 int pedantic; /* signal pedantic warnings */
187 void *_private; /* For user data, libxml won't touch it */
188
189 int loadsubset; /* should the external subset be loaded */
190};
191
192/**
193 * a SAX Locator.
194 */
195typedef struct _xmlSAXLocator xmlSAXLocator;
196typedef xmlSAXLocator *xmlSAXLocatorPtr;
197struct _xmlSAXLocator {
198 const xmlChar *(*getPublicId)(void *ctx);
199 const xmlChar *(*getSystemId)(void *ctx);
200 int (*getLineNumber)(void *ctx);
201 int (*getColumnNumber)(void *ctx);
202};
203
204/**
205 * a SAX handler is bunch of callbacks called by the parser when processing
206 * of the input generate data or structure informations.
207 */
208
209typedef xmlParserInputPtr (*resolveEntitySAXFunc) (void *ctx,
210 const xmlChar *publicId, const xmlChar *systemId);
211typedef void (*internalSubsetSAXFunc) (void *ctx, const xmlChar *name,
212 const xmlChar *ExternalID, const xmlChar *SystemID);
213typedef void (*externalSubsetSAXFunc) (void *ctx, const xmlChar *name,
214 const xmlChar *ExternalID, const xmlChar *SystemID);
215typedef xmlEntityPtr (*getEntitySAXFunc) (void *ctx,
216 const xmlChar *name);
217typedef xmlEntityPtr (*getParameterEntitySAXFunc) (void *ctx,
218 const xmlChar *name);
219typedef void (*entityDeclSAXFunc) (void *ctx,
220 const xmlChar *name, int type, const xmlChar *publicId,
221 const xmlChar *systemId, xmlChar *content);
222typedef void (*notationDeclSAXFunc)(void *ctx, const xmlChar *name,
223 const xmlChar *publicId, const xmlChar *systemId);
224typedef void (*attributeDeclSAXFunc)(void *ctx, const xmlChar *elem,
225 const xmlChar *name, int type, int def,
226 const xmlChar *defaultValue, xmlEnumerationPtr tree);
227typedef void (*elementDeclSAXFunc)(void *ctx, const xmlChar *name,
228 int type, xmlElementContentPtr content);
229typedef void (*unparsedEntityDeclSAXFunc)(void *ctx,
230 const xmlChar *name, const xmlChar *publicId,
231 const xmlChar *systemId, const xmlChar *notationName);
232typedef void (*setDocumentLocatorSAXFunc) (void *ctx,
233 xmlSAXLocatorPtr loc);
234typedef void (*startDocumentSAXFunc) (void *ctx);
235typedef void (*endDocumentSAXFunc) (void *ctx);
236typedef void (*startElementSAXFunc) (void *ctx, const xmlChar *name,
237 const xmlChar **atts);
238typedef void (*endElementSAXFunc) (void *ctx, const xmlChar *name);
239typedef void (*attributeSAXFunc) (void *ctx, const xmlChar *name,
240 const xmlChar *value);
241typedef void (*referenceSAXFunc) (void *ctx, const xmlChar *name);
242typedef void (*charactersSAXFunc) (void *ctx, const xmlChar *ch,
243 int len);
244typedef void (*ignorableWhitespaceSAXFunc) (void *ctx,
245 const xmlChar *ch, int len);
246typedef void (*processingInstructionSAXFunc) (void *ctx,
247 const xmlChar *target, const xmlChar *data);
248typedef void (*commentSAXFunc) (void *ctx, const xmlChar *value);
249typedef void (*cdataBlockSAXFunc) (void *ctx, const xmlChar *value, int len);
250typedef void (*warningSAXFunc) (void *ctx, const char *msg, ...);
251typedef void (*errorSAXFunc) (void *ctx, const char *msg, ...);
252typedef void (*fatalErrorSAXFunc) (void *ctx, const char *msg, ...);
253typedef int (*isStandaloneSAXFunc) (void *ctx);
254typedef int (*hasInternalSubsetSAXFunc) (void *ctx);
255typedef int (*hasExternalSubsetSAXFunc) (void *ctx);
256
257typedef struct _xmlSAXHandler xmlSAXHandler;
258typedef xmlSAXHandler *xmlSAXHandlerPtr;
259struct _xmlSAXHandler {
260 internalSubsetSAXFunc internalSubset;
261 isStandaloneSAXFunc isStandalone;
262 hasInternalSubsetSAXFunc hasInternalSubset;
263 hasExternalSubsetSAXFunc hasExternalSubset;
264 resolveEntitySAXFunc resolveEntity;
265 getEntitySAXFunc getEntity;
266 entityDeclSAXFunc entityDecl;
267 notationDeclSAXFunc notationDecl;
268 attributeDeclSAXFunc attributeDecl;
269 elementDeclSAXFunc elementDecl;
270 unparsedEntityDeclSAXFunc unparsedEntityDecl;
271 setDocumentLocatorSAXFunc setDocumentLocator;
272 startDocumentSAXFunc startDocument;
273 endDocumentSAXFunc endDocument;
274 startElementSAXFunc startElement;
275 endElementSAXFunc endElement;
276 referenceSAXFunc reference;
277 charactersSAXFunc characters;
278 ignorableWhitespaceSAXFunc ignorableWhitespace;
279 processingInstructionSAXFunc processingInstruction;
280 commentSAXFunc comment;
281 warningSAXFunc warning;
282 errorSAXFunc error;
283 fatalErrorSAXFunc fatalError;
284 getParameterEntitySAXFunc getParameterEntity;
285 cdataBlockSAXFunc cdataBlock;
286 externalSubsetSAXFunc externalSubset;
287};
288
289/**
290 * External entity loaders types
291 */
292typedef xmlParserInputPtr (*xmlExternalEntityLoader)(const char *URL,
293 const char *ID,
294 xmlParserCtxtPtr context);
295
296/**
297 * Global variables: just the default SAX interface tables and XML
298 * version infos.
299 */
300LIBXML_DLL_IMPORT extern const char *xmlParserVersion;
301
302LIBXML_DLL_IMPORT extern xmlSAXLocator xmlDefaultSAXLocator;
303LIBXML_DLL_IMPORT extern xmlSAXHandler xmlDefaultSAXHandler;
304LIBXML_DLL_IMPORT extern xmlSAXHandler htmlDefaultSAXHandler;
305LIBXML_DLL_IMPORT extern xmlSAXHandler sgmlDefaultSAXHandler;
306
307/**
308 * entity substitution default behaviour.
309 */
310
311#ifdef VMS
312LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultVal;
313#define xmlSubstituteEntitiesDefaultValue xmlSubstituteEntitiesDefaultVal
314#else
315LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultValue;
316#endif
317LIBXML_DLL_IMPORT extern int xmlGetWarningsDefaultValue;
318
319
/**
 * Parser initialization and cleanup entry points.
 * Both take no argument and return nothing.
 */
void xmlInitParser(void);
void xmlCleanupParser(void);
325
326/**
327 * Input functions
328 */
329int xmlParserInputRead (xmlParserInputPtr in,
330 int len);
331int xmlParserInputGrow (xmlParserInputPtr in,
332 int len);
333
334/**
335 * xmlChar handling
336 */
337xmlChar * xmlStrdup (const xmlChar *cur);
338xmlChar * xmlStrndup (const xmlChar *cur,
339 int len);
340xmlChar * xmlStrsub (const xmlChar *str,
341 int start,
342 int len);
343const xmlChar * xmlStrchr (const xmlChar *str,
344 xmlChar val);
345const xmlChar * xmlStrstr (const xmlChar *str,
346 xmlChar *val);
347const xmlChar * xmlStrcasestr (const xmlChar *str,
348 xmlChar *val);
349int xmlStrcmp (const xmlChar *str1,
350 const xmlChar *str2);
351int xmlStrncmp (const xmlChar *str1,
352 const xmlChar *str2,
353 int len);
354int xmlStrcasecmp (const xmlChar *str1,
355 const xmlChar *str2);
356int xmlStrncasecmp (const xmlChar *str1,
357 const xmlChar *str2,
358 int len);
359int xmlStrEqual (const xmlChar *str1,
360 const xmlChar *str2);
361int xmlStrlen (const xmlChar *str);
362xmlChar * xmlStrcat (xmlChar *cur,
363 const xmlChar *add);
364xmlChar * xmlStrncat (xmlChar *cur,
365 const xmlChar *add,
366 int len);
367
368/**
369 * Basic parsing Interfaces
370 */
371xmlDocPtr xmlParseDoc (xmlChar *cur);
372xmlDocPtr xmlParseMemory (char *buffer,
373 int size);
374xmlDocPtr xmlParseFile (const char *filename);
375int xmlSubstituteEntitiesDefault(int val);
376int xmlKeepBlanksDefault (int val);
377void xmlStopParser (xmlParserCtxtPtr ctxt);
378int xmlPedanticParserDefault(int val);
379
380/**
381 * Recovery mode
382 */
383xmlDocPtr xmlRecoverDoc (xmlChar *cur);
384xmlDocPtr xmlRecoverMemory (char *buffer,
385 int size);
386xmlDocPtr xmlRecoverFile (const char *filename);
387
388/**
389 * Less common routines and SAX interfaces
390 */
391int xmlParseDocument (xmlParserCtxtPtr ctxt);
392int xmlParseExtParsedEnt (xmlParserCtxtPtr ctxt);
393xmlDocPtr xmlSAXParseDoc (xmlSAXHandlerPtr sax,
394 xmlChar *cur,
395 int recovery);
396int xmlSAXUserParseFile (xmlSAXHandlerPtr sax,
397 void *user_data,
398 const char *filename);
399int xmlSAXUserParseMemory (xmlSAXHandlerPtr sax,
400 void *user_data,
401 char *buffer,
402 int size);
403xmlDocPtr xmlSAXParseMemory (xmlSAXHandlerPtr sax,
404 char *buffer,
405 int size,
406 int recovery);
407xmlDocPtr xmlSAXParseFile (xmlSAXHandlerPtr sax,
408 const char *filename,
409 int recovery);
410xmlDocPtr xmlSAXParseEntity (xmlSAXHandlerPtr sax,
411 const char *filename);
412xmlDocPtr xmlParseEntity (const char *filename);
413xmlDtdPtr xmlParseDTD (const xmlChar *ExternalID,
414 const xmlChar *SystemID);
415xmlDtdPtr xmlSAXParseDTD (xmlSAXHandlerPtr sax,
416 const xmlChar *ExternalID,
417 const xmlChar *SystemID);
418xmlDtdPtr xmlIOParseDTD (xmlSAXHandlerPtr sax,
419 xmlParserInputBufferPtr input,
420 xmlCharEncoding enc);
421int xmlParseBalancedChunkMemory(xmlDocPtr doc,
422 xmlSAXHandlerPtr sax,
423 void *user_data,
424 int depth,
425 const xmlChar *string,
426 xmlNodePtr *list);
427int xmlParseExternalEntity (xmlDocPtr doc,
428 xmlSAXHandlerPtr sax,
429 void *user_data,
430 int depth,
431 const xmlChar *URL,
432 const xmlChar *ID,
433 xmlNodePtr *list);
434int xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx,
435 const xmlChar *URL,
436 const xmlChar *ID,
437 xmlNodePtr *list);
438
/**
 * SAX initialization routines, one for the XML default handler and one
 * for the HTML default handler. Neither takes arguments nor returns a
 * value.
 */
void xmlDefaultSAXHandlerInit(void);
void htmlDefaultSAXHandlerInit(void);
444
445/**
446 * Parser contexts handling.
447 */
448void xmlInitParserCtxt (xmlParserCtxtPtr ctxt);
449void xmlClearParserCtxt (xmlParserCtxtPtr ctxt);
450void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
451void xmlSetupParserForBuffer (xmlParserCtxtPtr ctxt,
452 const xmlChar* buffer,
453 const char* filename);
454xmlParserCtxtPtr xmlCreateDocParserCtxt (xmlChar *cur);
455
456/**
457 * Reading/setting optional parsing features.
458 */
459
460int xmlGetFeaturesList (int *len,
461 const char **result);
462int xmlGetFeature (xmlParserCtxtPtr ctxt,
463 const char *name,
464 void *result);
465int xmlSetFeature (xmlParserCtxtPtr ctxt,
466 const char *name,
467 void *value);
468
469/**
470 * Interfaces for the Push mode
471 */
472xmlParserCtxtPtr xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax,
473 void *user_data,
474 const char *chunk,
475 int size,
476 const char *filename);
477int xmlParseChunk (xmlParserCtxtPtr ctxt,
478 const char *chunk,
479 int size,
480 int terminate);
481
482/**
483 * Special I/O mode
484 */
485
486xmlParserCtxtPtr xmlCreateIOParserCtxt (xmlSAXHandlerPtr sax,
487 void *user_data,
488 xmlInputReadCallback ioread,
489 xmlInputCloseCallback ioclose,
490 void *ioctx,
491 xmlCharEncoding enc);
492
493xmlParserInputPtr xmlNewIOInputStream (xmlParserCtxtPtr ctxt,
494 xmlParserInputBufferPtr input,
495 xmlCharEncoding enc);
496
497/**
498 * Node infos
499 */
500const xmlParserNodeInfo*
501 xmlParserFindNodeInfo (const xmlParserCtxt* ctxt,
502 const xmlNode* node);
503void xmlInitNodeInfoSeq (xmlParserNodeInfoSeqPtr seq);
504void xmlClearNodeInfoSeq (xmlParserNodeInfoSeqPtr seq);
505unsigned long xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeq* seq,
506 const xmlNode* node);
507void xmlParserAddNodeInfo (xmlParserCtxtPtr ctxt,
508 const xmlParserNodeInfo* info);
509
510/*
511 * External entities handling actually implemented in xmlIO
512 */
513
514void xmlSetExternalEntityLoader(xmlExternalEntityLoader f);
515xmlExternalEntityLoader
516 xmlGetExternalEntityLoader(void);
517xmlParserInputPtr
518 xmlLoadExternalEntity (const char *URL,
519 const char *ID,
520 xmlParserCtxtPtr context);
521
522#ifdef __cplusplus
523}
524#endif
525
526#endif /* __XML_PARSER_H__ */
527