/*
 * HTMLparser.h : interface for an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
| 8 | |
| 9 | #ifndef __HTML_PARSER_H__ |
| 10 | #define __HTML_PARSER_H__ |
Daniel Veillard | 361d845 | 2000-04-03 19:48:13 +0000 | [diff] [blame] | 11 | #include <libxml/parser.h> |
Daniel Veillard | be70ff7 | 1999-07-05 16:50:46 +0000 | [diff] [blame] | 12 | |
Daniel Veillard | f600e25 | 1999-12-18 15:32:46 +0000 | [diff] [blame] | 13 | #ifdef __cplusplus |
Daniel Veillard | 5cb5ab8 | 1999-12-21 15:35:29 +0000 | [diff] [blame] | 14 | extern "C" { |
Daniel Veillard | f600e25 | 1999-12-18 15:32:46 +0000 | [diff] [blame] | 15 | #endif |
| 16 | |
Daniel Veillard | 5233ffc | 1999-07-06 22:25:25 +0000 | [diff] [blame] | 17 | /* |
| 18 | * Most of the back-end structures from XML and HTML are shared |
| 19 | */ |
Daniel Veillard | be70ff7 | 1999-07-05 16:50:46 +0000 | [diff] [blame] | 20 | typedef xmlParserCtxt htmlParserCtxt; |
| 21 | typedef xmlParserCtxtPtr htmlParserCtxtPtr; |
| 22 | typedef xmlParserNodeInfo htmlParserNodeInfo; |
| 23 | typedef xmlSAXHandler htmlSAXHandler; |
| 24 | typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; |
| 25 | typedef xmlParserInput htmlParserInput; |
| 26 | typedef xmlParserInputPtr htmlParserInputPtr; |
| 27 | typedef xmlDocPtr htmlDocPtr; |
| 28 | typedef xmlNodePtr htmlNodePtr; |
| 29 | |
/*
 * Internal description of an HTML element.
 *
 * htmlElemDesc is the record type, htmlElemDescPtr a pointer to it.
 * The field order is part of the layout: positional initializers
 * depend on it, so do not reorder the members.
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;       /* The tag name */
    char startTag;          /* Whether the start tag can be implied */
    char endTag;            /* Whether the end tag can be implied */
    char saveEndTag;        /* Whether the end tag should be saved */
    char empty;             /* Is this an empty element ? */
    char depr;              /* Is this a deprecated element ? */
    char dtd;               /* 1: only in Loose DTD, 2: only Frameset one */
    const char *desc;       /* the description */
};
Daniel Veillard | 5233ffc | 1999-07-06 22:25:25 +0000 | [diff] [blame] | 45 | |
/*
 * Internal description of an HTML entity.
 *
 * htmlEntityDesc is the record type, htmlEntityDescPtr a pointer to it.
 * The field order is part of the layout: positional initializers
 * depend on it, so do not reorder the members.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    int value;              /* the UNICODE value for the character */
    const char *name;       /* The entity name */
    const char *desc;       /* the description */
};
Daniel Veillard | 5233ffc | 1999-07-06 22:25:25 +0000 | [diff] [blame] | 56 | |
| 57 | /* |
| 58 | * There is only few public functions. |
| 59 | */ |
Daniel Veillard | 5cb5ab8 | 1999-12-21 15:35:29 +0000 | [diff] [blame] | 60 | htmlElemDescPtr htmlTagLookup (const xmlChar *tag); |
| 61 | htmlEntityDescPtr htmlEntityLookup(const xmlChar *name); |
Daniel Veillard | 47f3f31 | 2000-08-27 22:40:15 +0000 | [diff] [blame] | 62 | htmlEntityDescPtr htmlEntityValueLookup(int value); |
Daniel Veillard | 82150d8 | 1999-07-07 07:32:15 +0000 | [diff] [blame] | 63 | |
Daniel Veillard | 5cb5ab8 | 1999-12-21 15:35:29 +0000 | [diff] [blame] | 64 | int htmlIsAutoClosed(htmlDocPtr doc, |
| 65 | htmlNodePtr elem); |
| 66 | int htmlAutoCloseTag(htmlDocPtr doc, |
| 67 | const xmlChar *name, |
| 68 | htmlNodePtr elem); |
| 69 | htmlEntityDescPtr htmlParseEntityRef(htmlParserCtxtPtr ctxt, |
| 70 | xmlChar **str); |
| 71 | int htmlParseCharRef(htmlParserCtxtPtr ctxt); |
| 72 | void htmlParseElement(htmlParserCtxtPtr ctxt); |
Daniel Veillard | be70ff7 | 1999-07-05 16:50:46 +0000 | [diff] [blame] | 73 | |
Daniel Veillard | 5cb5ab8 | 1999-12-21 15:35:29 +0000 | [diff] [blame] | 74 | htmlDocPtr htmlSAXParseDoc (xmlChar *cur, |
| 75 | const char *encoding, |
| 76 | htmlSAXHandlerPtr sax, |
| 77 | void *userData); |
| 78 | htmlDocPtr htmlParseDoc (xmlChar *cur, |
| 79 | const char *encoding); |
| 80 | htmlDocPtr htmlSAXParseFile(const char *filename, |
| 81 | const char *encoding, |
| 82 | htmlSAXHandlerPtr sax, |
| 83 | void *userData); |
| 84 | htmlDocPtr htmlParseFile (const char *filename, |
| 85 | const char *encoding); |
/*
 * Buffer-to-buffer conversion routines.
 *
 * Both take an output buffer with a pointer to its length and a
 * read-only input buffer with a pointer to its length; the length
 * arguments are passed by pointer, so they are presumably updated by
 * the callee -- TODO confirm the exact in/out convention and the
 * meaning of the int result against the implementation.
 * htmlEncodeEntities takes one extra argument, quoteChar.
 */
int                 UTF8ToHtml        (unsigned char *out,
                                       int *outlen,
                                       const unsigned char *in,
                                       int *inlen);
int                 htmlEncodeEntities(unsigned char *out,
                                       int *outlen,
                                       const unsigned char *in,
                                       int *inlen, int quoteChar);
Daniel Veillard | 47e12f2 | 2000-10-15 14:24:25 +0000 | [diff] [blame] | 94 | int htmlIsScriptAttribute(const xmlChar *name); |
/* Takes an int setting (val) and returns an int; NOTE(review): semantics of both are not visible here -- confirm. */
int                 htmlHandleOmittedElem(int val);
Daniel Veillard | be70ff7 | 1999-07-05 16:50:46 +0000 | [diff] [blame] | 96 | |
Daniel Veillard | 5e5c623 | 1999-12-29 12:49:06 +0000 | [diff] [blame] | 97 | /** |
| 98 | * Interfaces for the Push mode |
| 99 | */ |
| 100 | void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); |
| 101 | htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, |
| 102 | void *user_data, |
| 103 | const char *chunk, |
| 104 | int size, |
| 105 | const char *filename, |
| 106 | xmlCharEncoding enc); |
| 107 | int htmlParseChunk (htmlParserCtxtPtr ctxt, |
| 108 | const char *chunk, |
| 109 | int size, |
| 110 | int terminate); |
Daniel Veillard | f600e25 | 1999-12-18 15:32:46 +0000 | [diff] [blame] | 111 | #ifdef __cplusplus |
| 112 | } |
| 113 | #endif |
| 114 | |
Daniel Veillard | be70ff7 | 1999-07-05 16:50:46 +0000 | [diff] [blame] | 115 | #endif /* __HTML_PARSER_H__ */ |