/*
 * parser.h : Interfaces, constants and types related to the XML parser.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */

9#ifndef __XML_PARSER_H__
10#define __XML_PARSER_H__
11
12#include <libxml/tree.h>
13#include <libxml/valid.h>
14#include <libxml/xmlIO.h>
15#include <libxml/entities.h>
16
17
18#ifdef __cplusplus
19extern "C" {
20#endif
21
22/*
23 * Constants.
24 */
25#define XML_DEFAULT_VERSION "1.0"
26
27/**
28 * an xmlParserInput is an input flow for the XML processor.
29 * Each entity parsed is associated an xmlParserInput (except the
30 * few predefined ones). This is the case both for internal entities
31 * - in which case the flow is already completely in memory - or
32 * external entities - in which case we use the buf structure for
33 * progressive reading and I18N conversions to the internal UTF-8 format.
34 */
35
36typedef void (* xmlParserInputDeallocate)(xmlChar *);
37typedef struct _xmlParserInput xmlParserInput;
38typedef xmlParserInput *xmlParserInputPtr;
39struct _xmlParserInput {
40 /* Input buffer */
41 xmlParserInputBufferPtr buf; /* UTF-8 encoded buffer */
42
43 const char *filename; /* The file analyzed, if any */
44 const char *directory; /* the directory/base of teh file */
45 const xmlChar *base; /* Base of the array to parse */
46 const xmlChar *cur; /* Current char being parsed */
Daniel Veillard48b2f892001-02-25 16:11:03 +000047 const xmlChar *end; /* end of the arry to parse */
Owen Taylor3473f882001-02-23 17:55:21 +000048 int length; /* length if known */
49 int line; /* Current line */
50 int col; /* Current column */
51 int consumed; /* How many xmlChars already consumed */
52 xmlParserInputDeallocate free; /* function to deallocate the base */
53 const xmlChar *encoding; /* the encoding string for entity */
54 const xmlChar *version; /* the version string for entity */
55 int standalone; /* Was that entity marked standalone */
56};
57
/**
 * The parser can be asked to collect Node information, i.e. at what
 * place in the file they were detected.
 * NOTE: This is off by default and not very well tested.
 */
typedef struct _xmlParserNodeInfo xmlParserNodeInfo;
typedef xmlParserNodeInfo *xmlParserNodeInfoPtr;

/* Location record for a single node. */
struct _xmlParserNodeInfo {
    const struct _xmlNode* node;      /* the node this record describes */
    /* Position & line # that the text that created the node begins & ends on */
    unsigned long begin_pos;
    unsigned long begin_line;
    unsigned long end_pos;
    unsigned long end_line;
};

/*
 * A sequence of xmlParserNodeInfo records.
 * NOTE(review): maximum/length look like allocated capacity vs. number of
 * entries in use -- confirm against the code that manages the buffer.
 */
typedef struct _xmlParserNodeInfoSeq xmlParserNodeInfoSeq;
typedef xmlParserNodeInfoSeq *xmlParserNodeInfoSeqPtr;
struct _xmlParserNodeInfoSeq {
    unsigned long maximum;
    unsigned long length;
    xmlParserNodeInfo* buffer;        /* array of node info records */
};

/**
 * The parser is now working also as a state based parser.
 * The recursive one uses the state info for entities processing.
 *
 * Only the first two enumerators carry explicit values (-1 and 0); the
 * remaining ones are numbered consecutively by the compiler, so inserting
 * or reordering entries changes their numeric values.
 */
typedef enum {
    XML_PARSER_EOF = -1,	/* nothing is to be parsed */
    XML_PARSER_START = 0,	/* nothing has been parsed */
    XML_PARSER_MISC,		/* Misc* before int subset */
    XML_PARSER_PI,		/* Within a processing instruction */
    XML_PARSER_DTD,		/* within some DTD content */
    XML_PARSER_PROLOG,		/* Misc* after internal subset */
    XML_PARSER_COMMENT,		/* within a comment */
    XML_PARSER_START_TAG,	/* within a start tag */
    XML_PARSER_CONTENT,		/* within the content */
    XML_PARSER_CDATA_SECTION,	/* within a CDATA section */
    XML_PARSER_END_TAG,		/* within a closing tag */
    XML_PARSER_ENTITY_DECL,	/* within an entity declaration */
    XML_PARSER_ENTITY_VALUE,	/* within an entity value in a decl */
    XML_PARSER_ATTRIBUTE_VALUE,	/* within an attribute value */
    XML_PARSER_SYSTEM_LITERAL,	/* within a SYSTEM value */
    XML_PARSER_EPILOG, 		/* the Misc* after the last end tag */
    XML_PARSER_IGNORE		/* within an IGNORED section */
} xmlParserInputState;

107/**
108 * The parser context.
109 * NOTE This doesn't completely defines the parser state, the (current ?)
110 * design of the parser uses recursive function calls since this allow
111 * and easy mapping from the production rules of the specification
112 * to the actual code. The drawback is that the actual function call
113 * also reflect the parser state. However most of the parsing routines
114 * takes as the only argument the parser context pointer, so migrating
115 * to a state based parser for progressive parsing shouldn't be too hard.
116 */
117typedef struct _xmlParserCtxt xmlParserCtxt;
118typedef xmlParserCtxt *xmlParserCtxtPtr;
119struct _xmlParserCtxt {
120 struct _xmlSAXHandler *sax; /* The SAX handler */
121 void *userData; /* For SAX interface only, used by DOM build */
122 xmlDocPtr myDoc; /* the document being built */
123 int wellFormed; /* is the document well formed */
124 int replaceEntities; /* shall we replace entities ? */
125 const xmlChar *version; /* the XML version string */
126 const xmlChar *encoding; /* the declared encoding, if any */
127 int standalone; /* standalone document */
128 int html; /* an HTML(1)/Docbook(2) document */
129
130 /* Input stream stack */
131 xmlParserInputPtr input; /* Current input stream */
132 int inputNr; /* Number of current input streams */
133 int inputMax; /* Max number of input streams */
134 xmlParserInputPtr *inputTab; /* stack of inputs */
135
136 /* Node analysis stack only used for DOM building */
137 xmlNodePtr node; /* Current parsed Node */
138 int nodeNr; /* Depth of the parsing stack */
139 int nodeMax; /* Max depth of the parsing stack */
140 xmlNodePtr *nodeTab; /* array of nodes */
141
142 int record_info; /* Whether node info should be kept */
143 xmlParserNodeInfoSeq node_seq; /* info about each node parsed */
144
145 int errNo; /* error code */
146
147 int hasExternalSubset; /* reference and external subset */
148 int hasPErefs; /* the internal subset has PE refs */
149 int external; /* are we parsing an external entity */
150
151 int valid; /* is the document valid */
152 int validate; /* shall we try to validate ? */
153 xmlValidCtxt vctxt; /* The validity context */
154
155 xmlParserInputState instate; /* current type of input */
156 int token; /* next char look-ahead */
157
158 char *directory; /* the data directory */
159
160 /* Node name stack */
161 xmlChar *name; /* Current parsed Node */
162 int nameNr; /* Depth of the parsing stack */
163 int nameMax; /* Max depth of the parsing stack */
164 xmlChar * *nameTab; /* array of nodes */
165
166 long nbChars; /* number of xmlChar processed */
167 long checkIndex; /* used by progressive parsing lookup */
168 int keepBlanks; /* ugly but ... */
169 int disableSAX; /* SAX callbacks are disabled */
170 int inSubset; /* Parsing is in int 1/ext 2 subset */
171 xmlChar * intSubName; /* name of subset */
172 xmlChar * extSubURI; /* URI of external subset */
173 xmlChar * extSubSystem; /* SYSTEM ID of external subset */
174
175 /* xml:space values */
176 int * space; /* Should the parser preserve spaces */
177 int spaceNr; /* Depth of the parsing stack */
178 int spaceMax; /* Max depth of the parsing stack */
179 int * spaceTab; /* array of space infos */
180
181 int depth; /* to prevent entity substitution loops */
182 xmlParserInputPtr entity; /* used to check entities boundaries */
183 int charset; /* encoding of the in-memory content
184 actually an xmlCharEncoding */
185 int nodelen; /* Those two fields are there to */
186 int nodemem; /* Speed up large node parsing */
187 int pedantic; /* signal pedantic warnings */
188 void *_private; /* For user data, libxml won't touch it */
189
190 int loadsubset; /* should the external subset be loaded */
191};
192
193/**
194 * a SAX Locator.
195 */
196typedef struct _xmlSAXLocator xmlSAXLocator;
197typedef xmlSAXLocator *xmlSAXLocatorPtr;
198struct _xmlSAXLocator {
199 const xmlChar *(*getPublicId)(void *ctx);
200 const xmlChar *(*getSystemId)(void *ctx);
201 int (*getLineNumber)(void *ctx);
202 int (*getColumnNumber)(void *ctx);
203};
204
205/**
206 * a SAX handler is bunch of callbacks called by the parser when processing
207 * of the input generate data or structure informations.
208 */
209
210typedef xmlParserInputPtr (*resolveEntitySAXFunc) (void *ctx,
211 const xmlChar *publicId, const xmlChar *systemId);
212typedef void (*internalSubsetSAXFunc) (void *ctx, const xmlChar *name,
213 const xmlChar *ExternalID, const xmlChar *SystemID);
214typedef void (*externalSubsetSAXFunc) (void *ctx, const xmlChar *name,
215 const xmlChar *ExternalID, const xmlChar *SystemID);
216typedef xmlEntityPtr (*getEntitySAXFunc) (void *ctx,
217 const xmlChar *name);
218typedef xmlEntityPtr (*getParameterEntitySAXFunc) (void *ctx,
219 const xmlChar *name);
220typedef void (*entityDeclSAXFunc) (void *ctx,
221 const xmlChar *name, int type, const xmlChar *publicId,
222 const xmlChar *systemId, xmlChar *content);
223typedef void (*notationDeclSAXFunc)(void *ctx, const xmlChar *name,
224 const xmlChar *publicId, const xmlChar *systemId);
225typedef void (*attributeDeclSAXFunc)(void *ctx, const xmlChar *elem,
226 const xmlChar *name, int type, int def,
227 const xmlChar *defaultValue, xmlEnumerationPtr tree);
228typedef void (*elementDeclSAXFunc)(void *ctx, const xmlChar *name,
229 int type, xmlElementContentPtr content);
230typedef void (*unparsedEntityDeclSAXFunc)(void *ctx,
231 const xmlChar *name, const xmlChar *publicId,
232 const xmlChar *systemId, const xmlChar *notationName);
233typedef void (*setDocumentLocatorSAXFunc) (void *ctx,
234 xmlSAXLocatorPtr loc);
235typedef void (*startDocumentSAXFunc) (void *ctx);
236typedef void (*endDocumentSAXFunc) (void *ctx);
237typedef void (*startElementSAXFunc) (void *ctx, const xmlChar *name,
238 const xmlChar **atts);
239typedef void (*endElementSAXFunc) (void *ctx, const xmlChar *name);
240typedef void (*attributeSAXFunc) (void *ctx, const xmlChar *name,
241 const xmlChar *value);
242typedef void (*referenceSAXFunc) (void *ctx, const xmlChar *name);
243typedef void (*charactersSAXFunc) (void *ctx, const xmlChar *ch,
244 int len);
245typedef void (*ignorableWhitespaceSAXFunc) (void *ctx,
246 const xmlChar *ch, int len);
247typedef void (*processingInstructionSAXFunc) (void *ctx,
248 const xmlChar *target, const xmlChar *data);
249typedef void (*commentSAXFunc) (void *ctx, const xmlChar *value);
250typedef void (*cdataBlockSAXFunc) (void *ctx, const xmlChar *value, int len);
251typedef void (*warningSAXFunc) (void *ctx, const char *msg, ...);
252typedef void (*errorSAXFunc) (void *ctx, const char *msg, ...);
253typedef void (*fatalErrorSAXFunc) (void *ctx, const char *msg, ...);
254typedef int (*isStandaloneSAXFunc) (void *ctx);
255typedef int (*hasInternalSubsetSAXFunc) (void *ctx);
256typedef int (*hasExternalSubsetSAXFunc) (void *ctx);
257
258typedef struct _xmlSAXHandler xmlSAXHandler;
259typedef xmlSAXHandler *xmlSAXHandlerPtr;
260struct _xmlSAXHandler {
261 internalSubsetSAXFunc internalSubset;
262 isStandaloneSAXFunc isStandalone;
263 hasInternalSubsetSAXFunc hasInternalSubset;
264 hasExternalSubsetSAXFunc hasExternalSubset;
265 resolveEntitySAXFunc resolveEntity;
266 getEntitySAXFunc getEntity;
267 entityDeclSAXFunc entityDecl;
268 notationDeclSAXFunc notationDecl;
269 attributeDeclSAXFunc attributeDecl;
270 elementDeclSAXFunc elementDecl;
271 unparsedEntityDeclSAXFunc unparsedEntityDecl;
272 setDocumentLocatorSAXFunc setDocumentLocator;
273 startDocumentSAXFunc startDocument;
274 endDocumentSAXFunc endDocument;
275 startElementSAXFunc startElement;
276 endElementSAXFunc endElement;
277 referenceSAXFunc reference;
278 charactersSAXFunc characters;
279 ignorableWhitespaceSAXFunc ignorableWhitespace;
280 processingInstructionSAXFunc processingInstruction;
281 commentSAXFunc comment;
282 warningSAXFunc warning;
283 errorSAXFunc error;
284 fatalErrorSAXFunc fatalError;
285 getParameterEntitySAXFunc getParameterEntity;
286 cdataBlockSAXFunc cdataBlock;
287 externalSubsetSAXFunc externalSubset;
288};
289
290/**
291 * External entity loaders types
292 */
293typedef xmlParserInputPtr (*xmlExternalEntityLoader)(const char *URL,
294 const char *ID,
295 xmlParserCtxtPtr context);
296
297/**
298 * Global variables: just the default SAX interface tables and XML
299 * version infos.
300 */
301LIBXML_DLL_IMPORT extern const char *xmlParserVersion;
302
303LIBXML_DLL_IMPORT extern xmlSAXLocator xmlDefaultSAXLocator;
304LIBXML_DLL_IMPORT extern xmlSAXHandler xmlDefaultSAXHandler;
305LIBXML_DLL_IMPORT extern xmlSAXHandler htmlDefaultSAXHandler;
Daniel Veillardeae522a2001-04-23 13:41:34 +0000306LIBXML_DLL_IMPORT extern xmlSAXHandler docbDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +0000307
308/**
309 * entity substitution default behaviour.
310 */
311
312#ifdef VMS
313LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultVal;
314#define xmlSubstituteEntitiesDefaultValue xmlSubstituteEntitiesDefaultVal
315#else
316LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultValue;
317#endif
318LIBXML_DLL_IMPORT extern int xmlGetWarningsDefaultValue;
319
320
/**
 * Init/Cleanup
 *
 * Both functions take no arguments and return nothing.
 * NOTE(review): call ordering and threading requirements are not visible
 * from this header -- see the implementation.
 */
void		xmlInitParser		(void);
void		xmlCleanupParser	(void);

327/**
328 * Input functions
329 */
330int xmlParserInputRead (xmlParserInputPtr in,
331 int len);
332int xmlParserInputGrow (xmlParserInputPtr in,
333 int len);
334
335/**
336 * xmlChar handling
337 */
338xmlChar * xmlStrdup (const xmlChar *cur);
339xmlChar * xmlStrndup (const xmlChar *cur,
340 int len);
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000341xmlChar * xmlCharStrndup (const char *cur,
342 int len);
343xmlChar * xmlCharStrdup (const char *cur);
Owen Taylor3473f882001-02-23 17:55:21 +0000344xmlChar * xmlStrsub (const xmlChar *str,
345 int start,
346 int len);
347const xmlChar * xmlStrchr (const xmlChar *str,
348 xmlChar val);
349const xmlChar * xmlStrstr (const xmlChar *str,
350 xmlChar *val);
351const xmlChar * xmlStrcasestr (const xmlChar *str,
352 xmlChar *val);
353int xmlStrcmp (const xmlChar *str1,
354 const xmlChar *str2);
355int xmlStrncmp (const xmlChar *str1,
356 const xmlChar *str2,
357 int len);
358int xmlStrcasecmp (const xmlChar *str1,
359 const xmlChar *str2);
360int xmlStrncasecmp (const xmlChar *str1,
361 const xmlChar *str2,
362 int len);
363int xmlStrEqual (const xmlChar *str1,
364 const xmlChar *str2);
365int xmlStrlen (const xmlChar *str);
366xmlChar * xmlStrcat (xmlChar *cur,
367 const xmlChar *add);
368xmlChar * xmlStrncat (xmlChar *cur,
369 const xmlChar *add,
370 int len);
371
372/**
373 * Basic parsing Interfaces
374 */
375xmlDocPtr xmlParseDoc (xmlChar *cur);
376xmlDocPtr xmlParseMemory (char *buffer,
377 int size);
378xmlDocPtr xmlParseFile (const char *filename);
379int xmlSubstituteEntitiesDefault(int val);
380int xmlKeepBlanksDefault (int val);
381void xmlStopParser (xmlParserCtxtPtr ctxt);
382int xmlPedanticParserDefault(int val);
383
384/**
385 * Recovery mode
386 */
387xmlDocPtr xmlRecoverDoc (xmlChar *cur);
388xmlDocPtr xmlRecoverMemory (char *buffer,
389 int size);
390xmlDocPtr xmlRecoverFile (const char *filename);
391
392/**
393 * Less common routines and SAX interfaces
394 */
395int xmlParseDocument (xmlParserCtxtPtr ctxt);
396int xmlParseExtParsedEnt (xmlParserCtxtPtr ctxt);
397xmlDocPtr xmlSAXParseDoc (xmlSAXHandlerPtr sax,
398 xmlChar *cur,
399 int recovery);
400int xmlSAXUserParseFile (xmlSAXHandlerPtr sax,
401 void *user_data,
402 const char *filename);
403int xmlSAXUserParseMemory (xmlSAXHandlerPtr sax,
404 void *user_data,
405 char *buffer,
406 int size);
407xmlDocPtr xmlSAXParseMemory (xmlSAXHandlerPtr sax,
408 char *buffer,
409 int size,
410 int recovery);
411xmlDocPtr xmlSAXParseFile (xmlSAXHandlerPtr sax,
412 const char *filename,
413 int recovery);
414xmlDocPtr xmlSAXParseEntity (xmlSAXHandlerPtr sax,
415 const char *filename);
416xmlDocPtr xmlParseEntity (const char *filename);
417xmlDtdPtr xmlParseDTD (const xmlChar *ExternalID,
418 const xmlChar *SystemID);
419xmlDtdPtr xmlSAXParseDTD (xmlSAXHandlerPtr sax,
420 const xmlChar *ExternalID,
421 const xmlChar *SystemID);
422xmlDtdPtr xmlIOParseDTD (xmlSAXHandlerPtr sax,
423 xmlParserInputBufferPtr input,
424 xmlCharEncoding enc);
425int xmlParseBalancedChunkMemory(xmlDocPtr doc,
426 xmlSAXHandlerPtr sax,
427 void *user_data,
428 int depth,
429 const xmlChar *string,
430 xmlNodePtr *list);
431int xmlParseExternalEntity (xmlDocPtr doc,
432 xmlSAXHandlerPtr sax,
433 void *user_data,
434 int depth,
435 const xmlChar *URL,
436 const xmlChar *ID,
437 xmlNodePtr *list);
438int xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx,
439 const xmlChar *URL,
440 const xmlChar *ID,
441 xmlNodePtr *list);
442
/**
 * SAX initialization routines
 *
 * Neither function takes arguments or returns a value.
 * NOTE(review): presumably they fill in the default XML and HTML SAX
 * handler tables declared in this header -- confirm.
 */
void		xmlDefaultSAXHandlerInit(void);
void		htmlDefaultSAXHandlerInit(void);

449/**
450 * Parser contexts handling.
451 */
452void xmlInitParserCtxt (xmlParserCtxtPtr ctxt);
453void xmlClearParserCtxt (xmlParserCtxtPtr ctxt);
454void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
455void xmlSetupParserForBuffer (xmlParserCtxtPtr ctxt,
456 const xmlChar* buffer,
457 const char* filename);
458xmlParserCtxtPtr xmlCreateDocParserCtxt (xmlChar *cur);
459
460/**
461 * Reading/setting optional parsing features.
462 */
463
464int xmlGetFeaturesList (int *len,
465 const char **result);
466int xmlGetFeature (xmlParserCtxtPtr ctxt,
467 const char *name,
468 void *result);
469int xmlSetFeature (xmlParserCtxtPtr ctxt,
470 const char *name,
471 void *value);
472
473/**
474 * Interfaces for the Push mode
475 */
476xmlParserCtxtPtr xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax,
477 void *user_data,
478 const char *chunk,
479 int size,
480 const char *filename);
481int xmlParseChunk (xmlParserCtxtPtr ctxt,
482 const char *chunk,
483 int size,
484 int terminate);
485
486/**
487 * Special I/O mode
488 */
489
490xmlParserCtxtPtr xmlCreateIOParserCtxt (xmlSAXHandlerPtr sax,
491 void *user_data,
492 xmlInputReadCallback ioread,
493 xmlInputCloseCallback ioclose,
494 void *ioctx,
495 xmlCharEncoding enc);
496
497xmlParserInputPtr xmlNewIOInputStream (xmlParserCtxtPtr ctxt,
498 xmlParserInputBufferPtr input,
499 xmlCharEncoding enc);
500
501/**
502 * Node infos
503 */
504const xmlParserNodeInfo*
505 xmlParserFindNodeInfo (const xmlParserCtxt* ctxt,
506 const xmlNode* node);
507void xmlInitNodeInfoSeq (xmlParserNodeInfoSeqPtr seq);
508void xmlClearNodeInfoSeq (xmlParserNodeInfoSeqPtr seq);
509unsigned long xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeq* seq,
510 const xmlNode* node);
511void xmlParserAddNodeInfo (xmlParserCtxtPtr ctxt,
512 const xmlParserNodeInfo* info);
513
514/*
515 * External entities handling actually implemented in xmlIO
516 */
517
518void xmlSetExternalEntityLoader(xmlExternalEntityLoader f);
519xmlExternalEntityLoader
520 xmlGetExternalEntityLoader(void);
521xmlParserInputPtr
522 xmlLoadExternalEntity (const char *URL,
523 const char *ID,
524 xmlParserCtxtPtr context);
525
526#ifdef __cplusplus
527}
528#endif
529
530#endif /* __XML_PARSER_H__ */
531