blob: dfec5a78c2c1006663c4b096bab532304809ee24 [file] [log] [blame]
/*
 * parser.c : an XML 1.0 non-validating parser
 *
 * Daniel Veillard, 1998-07-24
 *
 * See Copyright for the status of this software.
 *
 * $Id$
 */
8
9#include <config.h>
10#include <stdio.h>
11#include <ctype.h>
12#include <string.h> /* for memset() only */
13#include <malloc.h>
14#include <sys/stat.h>
15#ifdef HAVE_FCNTL_H
16#include <fcntl.h>
17#endif
18#ifdef HAVE_UNISTD_H
19#include <unistd.h>
20#endif
21#ifdef HAVE_ZLIB_H
22#include <zlib.h>
23#endif
24
25#include "xml_tree.h"
26#include "xml_parser.h"
27#include "xml_entities.h"
28
/*
 * A few macros needed to help building the parser. They implement
 * (part of) the character classes of the XML 1.0 specification.
 *
 * Every macro evaluates its argument more than once: never pass an
 * expression with side effects.
 */

#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now TODO !!!!
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * Only the Latin-1 part of BaseChar is covered TODO !!!!
 * NOTE(review): 0xba is accepted although the BaseChar production of
 * the XML 1.0 specification does not seem to list it -- confirm.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

/* Only the ASCII digits are covered TODO !!!! */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* Not implemented TODO !!!! */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
     ((c) == 0xfeff))

#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* S ::= (#x20 | #x9 | #xD | #xA)+ */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * NOTE(review): 0xba is accepted although the BaseChar production of
 * the XML 1.0 specification does not seem to list it -- confirm.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* S ::= (#x20 | #x9 | #xD | #xA)+ */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif

/*
 * The macros below advance the pointer given as argument. They are
 * wrapped in do { } while (0) so that they behave as one statement;
 * the call site provides the terminating ';'.
 */

/* Skip one end of line: CR, LF, CR LF or LF CR. */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { (p)++; if (*(p) == 0x0a) (p)++; } \
        else if (*(p) == 0x0a) { (p)++; if (*(p) == 0x0d) (p)++; } \
    } while (0)

/* Skip any run of blank characters. */
#define SKIP_BLANKS(p) \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)

/* Move to the next '>', or to the first character failing IS_CHAR(). */
#define MOVETO_ENDTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/* Move to the next '<', or to the first character failing IS_CHAR(). */
#define MOVETO_STARTTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
129
/*
 * Forward declaration needed for the recursive behaviour.
 */
133xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt);
134
135/*
136 * xmlHandleData : this routine represent's the specific application
137 * behaviour when reading a piece of text.
138 *
139 * For example in WebDav, any piece made only of blanks is eliminated
140 */
141
142CHAR *xmlHandleData(CHAR *in) {
143 CHAR *cur;
144
145 if (in == NULL) return(NULL);
146 cur = in;
147 while (IS_CHAR(*cur)) {
148 if (!IS_BLANK(*cur)) goto not_blank;
149 cur++;
150 }
151 free(in);
152 return(NULL);
153
154not_blank:
155 return(in);
156}
157
158/*
159 * xmlStrndup : a strdup for array of CHAR's
160 */
161
162CHAR *xmlStrndup(const CHAR *cur, int len) {
163 CHAR *ret = malloc((len + 1) * sizeof(CHAR));
164
165 if (ret == NULL) {
166 fprintf(stderr, "malloc of %d byte failed\n",
167 (len + 1) * sizeof(CHAR));
168 return(NULL);
169 }
170 memcpy(ret, cur, len * sizeof(CHAR));
171 ret[len] = 0;
172 return(ret);
173}
174
175/*
176 * xmlStrdup : a strdup for CHAR's
177 */
178
179CHAR *xmlStrdup(const CHAR *cur) {
180 const CHAR *p = cur;
181
182 while (IS_CHAR(*p)) p++;
183 return(xmlStrndup(cur, p - cur));
184}
185
186/*
187 * xmlStrcmp : a strcmp for CHAR's
188 */
189
190int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
191 register int tmp;
192
193 do {
194 tmp = *str1++ - *str2++;
195 if (tmp != 0) return(tmp);
196 } while ((*str1 != 0) && (*str2 != 0));
197 return (*str1 - *str2);
198}
199
200/*
201 * xmlStrncmp : a strncmp for CHAR's
202 */
203
204int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
205 register int tmp;
206
207 if (len <= 0) return(0);
208 do {
209 tmp = *str1++ - *str2++;
210 if (tmp != 0) return(tmp);
211 len--;
212 if (len <= 0) return(0);
213 } while ((*str1 != 0) && (*str2 != 0));
214 return (*str1 - *str2);
215}
216
217/*
218 * xmlStrchr : a strchr for CHAR's
219 */
220
221CHAR *xmlStrchr(const CHAR *str, CHAR val) {
222 while (*str != 0) {
223 if (*str == val) return((CHAR *) str);
224 str++;
225 }
226 return(NULL);
227}
228
229/*
230 * xmlParseName : parse an XML name.
231 */
232
233CHAR *xmlParseName(xmlParserCtxtPtr ctxt) {
234 const CHAR *q;
235 CHAR *ret = NULL;
236
237 /*
238 * Name ::= (Letter | '_') (NameChar)*
239 */
240 if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL);
241 q = ctxt->cur++;
242 while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
243 (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || (ctxt->cur[0] == '_') ||
244 (ctxt->cur[0] == ':') ||
245 (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
246 (IS_EXTENDER(ctxt->cur[0])))
247 ctxt->cur++;
248
249 ret = xmlStrndup(q, ctxt->cur - q);
250
251 return(ret);
252}
253
254/*
255 * Parse and return a string between quotes or doublequotes
256 */
257CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
258 CHAR *ret = NULL;
259 const CHAR *q;
260
261 if (ctxt->cur[0] == '"') {
262 ctxt->cur++;
263 q = ctxt->cur;
264 while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '"')) ctxt->cur++;
265 if (ctxt->cur[0] != '"')
266 fprintf(stderr, "String not closed \"%.50s\n", q);
267 else {
268 ret = xmlStrndup(q, ctxt->cur - q);
269 ctxt->cur++;
270 }
271 } else if (ctxt->cur[0] == '\''){
272 ctxt->cur++;
273 q = ctxt->cur;
274 while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '\'')) ctxt->cur++;
275 if (ctxt->cur[0] != '\'')
276 fprintf(stderr, "String not closed '%.50s\n", q);
277 else {
278 ret = xmlStrndup(q, ctxt->cur - q);
279 ctxt->cur++;
280 }
281 }
282 return(ret);
283}
284
285/*
286 * Skip an XML (SGML) comment <!-- .... -->
287 *
288 * TODO !!!! Save the comment in the tree !!!
289 */
290void xmlParserSkipComment(xmlParserCtxtPtr ctxt) {
291 const CHAR *q, *start;
292 const CHAR *r;
293
294 /*
295 * An extra check may avoid errors and isn't that costly !
296 */
297 if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '!') ||
298 (ctxt->cur[2] != '-') || (ctxt->cur[3] != '-')) return;
299
300 ctxt->cur += 4;
301 start = q = ctxt->cur;
302 ctxt->cur++;
303 r = ctxt->cur;
304 ctxt->cur++;
305 while (IS_CHAR(ctxt->cur[0]) &&
306 ((ctxt->cur[0] == ':') || (ctxt->cur[0] != '>') ||
307 (*r != '-') || (*q != '-'))) {
308 ctxt->cur++;r++;q++;
309 }
310 if (!IS_CHAR(ctxt->cur[0])) {
311 fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
312 ctxt->cur = start; /* !!! We shouldn't really try to recover !!! */
313 } else {
314 ctxt->cur++;
315 }
316}
317
318/*
319 * xmlParseNamespace: parse specific '<?namespace ...' constructs.
320 */
321
322void xmlParseNamespace(xmlParserCtxtPtr ctxt) {
323 CHAR *href = NULL;
324 CHAR *AS = NULL;
325 int garbage = 0;
326
327 /*
328 * We just skipped "namespace" or "xml:namespace"
329 */
330 SKIP_BLANKS(ctxt->cur);
331
332 while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '>')) {
333 /*
334 * We can have "ns" or "prefix" attributes
335 * Old encoding as 'href' or 'AS' attributes is still supported
336 */
337 if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 's')) {
338 garbage = 0;
339 ctxt->cur += 2;
340 SKIP_BLANKS(ctxt->cur);
341
342 if (ctxt->cur[0] != '=') continue;
343 ctxt->cur++;
344 SKIP_BLANKS(ctxt->cur);
345
346 href = xmlParseQuotedString(ctxt);
347 SKIP_BLANKS(ctxt->cur);
348 } else if ((ctxt->cur[0] == 'h') && (ctxt->cur[1] == 'r') &&
349 (ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f')) {
350 garbage = 0;
351 ctxt->cur += 4;
352 SKIP_BLANKS(ctxt->cur);
353
354 if (ctxt->cur[0] != '=') continue;
355 ctxt->cur++;
356 SKIP_BLANKS(ctxt->cur);
357
358 href = xmlParseQuotedString(ctxt);
359 SKIP_BLANKS(ctxt->cur);
360 } else if ((ctxt->cur[0] == 'p') && (ctxt->cur[1] == 'r') &&
361 (ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f') &&
362 (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'x')) {
363 garbage = 0;
364 ctxt->cur += 6;
365 SKIP_BLANKS(ctxt->cur);
366
367 if (ctxt->cur[0] != '=') continue;
368 ctxt->cur++;
369 SKIP_BLANKS(ctxt->cur);
370
371 AS = xmlParseQuotedString(ctxt);
372 SKIP_BLANKS(ctxt->cur);
373 } else if ((ctxt->cur[0] == 'A') && (ctxt->cur[1] == 'S')) {
374 garbage = 0;
375 ctxt->cur += 2;
376 SKIP_BLANKS(ctxt->cur);
377
378 if (ctxt->cur[0] != '=') continue;
379 ctxt->cur++;
380 SKIP_BLANKS(ctxt->cur);
381
382 AS = xmlParseQuotedString(ctxt);
383 SKIP_BLANKS(ctxt->cur);
384 } else if ((ctxt->cur[0] == '?') && (ctxt->cur[1] == '>')) {
385 garbage = 0;
386 ctxt->cur ++;
387 } else {
388 /*
389 * Found garbage when parsing the namespace
390 */
391 if (!garbage) fprintf(stderr,
392 "\nxmlParseNamespace found garbage: ");
393 fprintf(stderr, "%c", ctxt->cur[0]);
394 ctxt->cur++;
395 }
396 }
397
398 MOVETO_ENDTAG(ctxt->cur);
399 ctxt->cur++;
400
401 /*
402 * Register the DTD.
403 */
404 if (href != NULL)
405 xmlNewDtd(ctxt->doc, href, AS);
406
407 if (AS != NULL) free(AS);
408 if (href != NULL) free(href);
409}
410
411/*
412 * xmlParsePI: parse an XML Processing Instruction.
413 */
414
415void xmlParsePI(xmlParserCtxtPtr ctxt) {
416 if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
417 /*
418 * this is a Processing Instruction.
419 */
420 ctxt->cur += 2;
421
422 /*
423 * Special for WebDav, support for the Processing Instruction
424 * '<?namespace ...' contruct in the header of the XML document.
425 */
426 if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 'a') &&
427 (ctxt->cur[2] == 'm') && (ctxt->cur[3] == 'e') &&
428 (ctxt->cur[4] == 's') && (ctxt->cur[5] == 'p') &&
429 (ctxt->cur[6] == 'a') && (ctxt->cur[7] == 'c') &&
430 (ctxt->cur[8] == 'e')) {
431 ctxt->cur += 9;
432 xmlParseNamespace(ctxt);
433 } else if ((ctxt->cur[0] == 'x') && (ctxt->cur[1] == 'm') &&
434 (ctxt->cur[2] == 'l') && (ctxt->cur[3] == ':') &&
435 (ctxt->cur[4] == 'n') && (ctxt->cur[5] == 'a') &&
436 (ctxt->cur[6] == 'm') && (ctxt->cur[7] == 'e') &&
437 (ctxt->cur[8] == 's') && (ctxt->cur[9] == 'p') &&
438 (ctxt->cur[10] == 'a') && (ctxt->cur[11] == 'c') &&
439 (ctxt->cur[12] == 'e')) {
440 ctxt->cur += 13;
441 xmlParseNamespace(ctxt);
442 } else {
443 /* Unknown PI, ignore it ! */
444 fprintf(stderr, "xmlParsePI : skipping unknown PI %30s\n",
445 ctxt->cur);
446 MOVETO_ENDTAG(ctxt->cur);
447 ctxt->cur++;
448 }
449 }
450}
451
452/*
453 * xmlParseAttribute: parse a start of tag.
454 *
455 * Attribute ::= Name Eq AttValue
456 */
457
458void xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
459 const CHAR *q;
460 CHAR *name, *value = NULL;
461
462 if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
463 return;
464 }
465 q = ctxt->cur++;
466 while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
467 (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
468 (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
469 (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
470 (IS_EXTENDER(ctxt->cur[0])))
471 ctxt->cur++;
472 name = xmlStrndup(q, ctxt->cur - q);
473
474 /*
475 * We should have the equal, we are laxist here and allow attributes
476 * without values and extra spaces.
477 */
478 SKIP_BLANKS(ctxt->cur);
479 if (ctxt->cur[0] == '=') {
480 ctxt->cur++;
481 SKIP_BLANKS(ctxt->cur);
482 if ((ctxt->cur[0] != '\'') && (ctxt->cur[0] != '"')) {
483 fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
484 q);
485 } else
486 value = xmlParseQuotedString(ctxt);
487 }
488
489 /*
490 * Add the attribute to the node.
491 */
492 if (name != NULL) {
493 xmlNewProp(node, name, value);
494 free(name);
495 }
496 if ( value != NULL )
497 free(value);
498}
499
500/*
501 * xmlParseStartTag: parse a start of tag.
502 */
503
504xmlNodePtr xmlParseStartTag(xmlParserCtxtPtr ctxt) {
505 const CHAR *q;
506 CHAR *ns, *name;
507 xmlDtdPtr dtd = NULL;
508 xmlNodePtr ret = NULL;
509
510 /*
511 * Theorically one should just parse a Name, but with the addition
512 * of the namespace needed for WebDav, it's a bit more complicated
513 * since the element name may be prefixed by a namespace prefix.
514 *
515 * QName ::= (NSPart ':')? LocalPart
516 * NSPart ::= Name
517 * LocalPart ::= Name
518 * STag ::= '<' QName (S Attribute)* S? '>'
519 *
520 * instead of :
521 *
522 * STag ::= '<' QName (S Attribute)* S? '>'
523 */
524 if (ctxt->cur[0] != '<') return(NULL);
525 ctxt->cur++;
526
527 if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL);
528 q = ctxt->cur++;
529 while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
530 (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
531 (ctxt->cur[0] == '_') ||
532 (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
533 (IS_EXTENDER(ctxt->cur[0])))
534 ctxt->cur++;
535
536 if (ctxt->cur[0] == ':') {
537 ns = xmlStrndup(q, ctxt->cur - q);
538
539 ctxt->cur++; /* skip the column */
540 if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
541 fprintf(stderr,
542 "Start tag : no element name after namespace identifier %.20s\n",
543 q);
544 free(ns);
545 return(NULL);
546 }
547 q = ctxt->cur++;
548 while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
549 (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
550 (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
551 (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
552 (IS_EXTENDER(ctxt->cur[0])))
553 ctxt->cur++;
554 name = xmlStrndup(q, ctxt->cur - q);
555
556 /*
557 * Search the DTD associated to ns.
558 */
559 dtd = xmlSearchDtd(ctxt->doc, ns);
560 if (dtd == NULL)
561 fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
562 free(ns);
563 } else
564 name = xmlStrndup(q, ctxt->cur - q);
565
566 ret = xmlNewNode(dtd, name, NULL);
567
568 /*
569 * Now parse the attributes, it ends up with the ending
570 *
571 * (S Attribute)* S?
572 */
573 SKIP_BLANKS(ctxt->cur);
574 while ((IS_CHAR(ctxt->cur[0])) &&
575 (ctxt->cur[0] != '>') &&
576 ((ctxt->cur[0] != '/') || (ctxt->cur[1] != '>'))) {
577 if (IS_LETTER(ctxt->cur[0]) || (ctxt->cur[0] == '_'))
578 xmlParseAttribute(ctxt, ret);
579 else {
580 /* We should warn TODO !!! */
581 ctxt->cur++;
582 }
583 SKIP_BLANKS(ctxt->cur);
584 }
585
586 return(ret);
587}
588
589/*
590 * xmlParseEndTag: parse an end of tag, note that the '</' part has
591 * already been read.
592 */
593
594void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
595 const CHAR *q;
596 CHAR *ns, *name;
597 xmlDtdPtr dtd = NULL;
598
599 *dtdPtr = NULL;
600 *tagPtr = NULL;
601
602 /*
603 * Theorically one should just parse a Name, but with the addition
604 * of the namespace needed for WebDav, it's a bit more complicated
605 * since the element name may be prefixed by a namespace prefix.
606 *
607 * QName ::= (NSPart ':')? LocalPart
608 * NSPart ::= Name
609 * LocalPart ::= Name
610 * ETag ::= '</' QName S? '>'
611 *
612 * instead of :
613 *
614 * ETag ::= '</' Name S? '>'
615 */
616 if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return;
617 q = ctxt->cur++;
618 while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
619 (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
620 (ctxt->cur[0] == '_') ||
621 (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
622 (IS_EXTENDER(ctxt->cur[0])))
623 ctxt->cur++;
624
625 if (ctxt->cur[0] == ':') {
626 ns = xmlStrndup(q, ctxt->cur - q);
627
628 ctxt->cur++; /* skip the column */
629 if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
630 fprintf(stderr,
631 "End tag : no element name after namespace identifier %.20s\n",
632 q);
633 free(ns);
634 return;
635 }
636 q = ctxt->cur++;
637 while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
638 (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
639 (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
640 (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
641 (IS_EXTENDER(ctxt->cur[0])))
642 ctxt->cur++;
643 name = xmlStrndup(q, ctxt->cur - q);
644
645 /*
646 * Search the DTD associated to ns.
647 */
648 dtd = xmlSearchDtd(ctxt->doc, ns);
649 if (dtd == NULL)
650 fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
651 free(ns);
652 } else
653 name = xmlStrndup(q, ctxt->cur - q);
654
655 *dtdPtr = dtd;
656 *tagPtr = name;
657
658 /*
659 * We should definitely be at the ending "S? '>'" part
660 */
661 SKIP_BLANKS(ctxt->cur);
662 if ((!IS_CHAR(ctxt->cur[0])) || (ctxt->cur[0] != '>')) {
663 fprintf(stderr, "End tag : expected '>', got %.20s\n", ctxt->cur);
664 /*
665 * Note : skipping to the next '>' is probably otherkill,
666 * especially in case the '>' is hust missing.
667 *
668 * Otherwise add:
669 * MOVETO_ENDTAG(ctxt->cur);
670 */
671 } else
672 ctxt->cur++;
673
674 return;
675}
676
677/*
678 * xmlParseCDSect: escaped pure raw content.
679 */
680CHAR *xmlParseCDSect(xmlParserCtxtPtr ctxt) {
681 const CHAR *r, *s, *base;
682 CHAR *ret;
683
684 base = ctxt->cur;
685 if (!IS_CHAR(ctxt->cur[0])) {
686 fprintf(stderr, "CData section not finished : %.20s\n", base);
687 return(NULL);
688 }
689 r = ctxt->cur++;
690 if (!IS_CHAR(ctxt->cur[0])) {
691 fprintf(stderr, "CData section not finished : %.20s\n", base);
692 return(NULL);
693 }
694 s = ctxt->cur++;
695 while (IS_CHAR(ctxt->cur[0]) &&
696 ((*r != ']') || (*s != ']') || (ctxt->cur[0] != '>'))) {
697 r++;s++;ctxt->cur++;
698 }
699 if (!IS_CHAR(ctxt->cur[0])) {
700 fprintf(stderr, "CData section not finished : %.20s\n", base);
701 return(NULL);
702 }
703 ret = xmlStrndup(base, ctxt->cur-base);
704
705 return(ret);
706}
707
708/*
709 * xmlParseContent: a content is
710 * (element | PCData | Reference | CDSect | PI | Comment)
711 *
712 * element : starts by '<'
713 * PCData : any CHAR but '&' or '<'
714 * Reference : starts by '&'
715 * CDSect : starts by '<![CDATA['
716 * PI : starts by '<?'
717 */
718
719xmlNodePtr xmlParseContent(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
720 const CHAR *q;
721 CHAR *data = NULL;
722 xmlNodePtr ret = NULL;
723
724 /*
725 * First case : a Processing Instruction.
726 */
727 if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
728 xmlParsePI(ctxt);
729 }
730 /*
731 * Second case : a CDSection
732 */
733 if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
734 (ctxt->cur[2] == '[') && (ctxt->cur[3] == 'C') &&
735 (ctxt->cur[4] == 'D') && (ctxt->cur[5] == 'A') &&
736 (ctxt->cur[6] == 'T') && (ctxt->cur[7] == 'A') &&
737 (ctxt->cur[8] == '[')) {
738 ctxt->cur += 9;
739 data = xmlParseCDSect(ctxt);
740 }
741 /*
742 * Third case : a sub-element.
743 */
744 else if (ctxt->cur[0] == '<') {
745 ret = xmlParseElement(ctxt);
746 }
747 /*
748 * Last case, text. Note that References are handled directly.
749 */
750 else {
751 q = ctxt->cur;
752 while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '<')) ctxt->cur++;
753
754 if (!IS_CHAR(ctxt->cur[0])) {
755 fprintf(stderr, "Truncated content : %.50s\n", q);
756 return(NULL);
757 }
758
759 /*
760 * Do the Entities decoding...
761 */
762 data = xmlStrdup(xmlDecodeEntities(ctxt->doc, q, ctxt->cur - q));
763 }
764
765 /*
766 * Handle the data if any. If there is no child
767 * add it as content, otherwise create a new node of type text.
768 */
769 if (data != NULL)
770 data = xmlHandleData(data);
771 if (data != NULL) {
772 if (node->childs == NULL)
773 xmlNodeSetContent(node, data);
774 else
775 ret = xmlNewText(data);
776 free(data);
777 }
778
779 return(ret);
780}
781
782/*
783 * xmlParseElement: parse an XML element
784 */
785
786xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt) {
787 xmlNodePtr ret, child;
788 const CHAR *openTag = ctxt->cur;
789 const CHAR *closeTag = ctxt->cur;
790
791 ret = xmlParseStartTag(ctxt);
792 if (ret == NULL) {
793 return(NULL);
794 }
795
796 /*
797 * Check for an Empty Element.
798 */
799 if ((ctxt->cur[0] == '/') && (ctxt->cur[1] == '>')) {
800 ctxt->cur += 2;
801 return(ret);
802 }
803 if (ctxt->cur[0] == '>') ctxt->cur++;
804 else {
805 fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", openTag);
806 return(NULL);
807 }
808
809 /*
810 * Parse the content of the element:
811 * (element | PCData | Reference | CDSect | PI | Comment) *
812 *
813 * element : starts by '<'
814 * PCData : any CHAR but '&' or '<'
815 * Reference : starts by '&'
816 * CDSect : starts by '<![CDATA['
817 * PI : starts by '<?'
818 *
819 * The loop stops upon detection of an end of tag '</'
820 */
821 while ((IS_CHAR(ctxt->cur[0])) &&
822 ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '/'))) {
823 child = xmlParseContent(ctxt, ret);
824 if (child != NULL)
825 xmlAddChild(ret, child);
826 }
827 if (!IS_CHAR(ctxt->cur[0])) {
828 fprintf(stderr, "Premature end of data in tag %.30s\n", openTag);
829 return(NULL);
830 }
831
832 /*
833 * parse the end of tag : '</' has been detected.
834 */
835 ctxt->cur += 2;
836 if (ctxt->cur[0] == '>') ctxt->cur++; /* simplified closing </> */
837 else {
838 CHAR *endTag;
839 xmlDtdPtr endDtd;
840
841 xmlParseEndTag(ctxt, &endDtd, &endTag);
842
843 /*
844 * Check that the Name in the ETag is the same as in the STag.
845 */
846 if (endDtd != ret->dtd) {
847 fprintf(stderr, "Start and End tags don't use the same DTD:\n");
848 fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
849 }
850 if (strcmp(ret->name, endTag)) {
851 fprintf(stderr, "Start and End tags don't use the same name:\n");
852 fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
853 }
854
855 if ( endTag != NULL )
856 free(endTag);
857 }
858
859 return(ret);
860}
861
862/*
863 * xmlParseXMLDecl: parse an XML declaration header
864 */
865
866void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
867 CHAR *version;
868
869 /*
870 * We know that '<?xml' is here.
871 */
872 ctxt->cur += 5;
873
874 /*
875 * Parse the version info
876 */
877 SKIP_BLANKS(ctxt->cur);
878
879 /*
880 * We should have 'version=' here !
881 */
882 if ((ctxt->cur[0] == 'v') && (ctxt->cur[1] == 'e') &&
883 (ctxt->cur[2] == 'r') && (ctxt->cur[3] == 's') &&
884 (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'o') &&
885 (ctxt->cur[6] == 'n') && (ctxt->cur[7] == '=')) {
886 ctxt->cur += 8;
887 version = xmlParseQuotedString(ctxt);
888 if (version == NULL)
889 ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
890 else {
891 ctxt->doc = xmlNewDoc(version);
892 free(version);
893 }
894 } else {
895 ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
896 }
897
898 /*
899 * We should check for Required Markup Declaration TODO !!!!
900 */
901 MOVETO_ENDTAG(ctxt->cur);
902 ctxt->cur++;
903
904}
905
906/*
907 * xmlParseMisc: parse an XML Misc optionnal field.
908 * (Comment | PI | S)*
909 */
910
911void xmlParseMisc(xmlParserCtxtPtr ctxt) {
912 while (((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) ||
913 ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
914 (ctxt->cur[2] == '-') && (ctxt->cur[2] == '-')) ||
915 IS_BLANK(ctxt->cur[0])) {
916 if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
917 xmlParsePI(ctxt);
918 } else if (IS_BLANK(ctxt->cur[0])) {
919 ctxt->cur++;
920 } else
921 xmlParserSkipComment(ctxt);
922 }
923}
924
925/*
926 * xmlParseDocument : parse an XML document and build a tree.
927 */
928
929int xmlParseDocument(xmlParserCtxtPtr ctxt) {
930 /*
931 * We should check for encoding here and plug-in some
932 * conversion code TODO !!!!
933 */
934
935 /*
936 * Wipe out everything which is before the first '<'
937 */
938 SKIP_BLANKS(ctxt->cur);
939
940 /*
941 * Check for the XMLDecl in the Prolog.
942 */
943 if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?') &&
944 (ctxt->cur[2] == 'x') && (ctxt->cur[3] == 'm') &&
945 (ctxt->cur[4] == 'l')) {
946 xmlParseXMLDecl(ctxt);
947 /* SKIP_EOL(cur); */
948 SKIP_BLANKS(ctxt->cur);
949 } else if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?') &&
950 (ctxt->cur[2] == 'X') && (ctxt->cur[3] == 'M') &&
951 (ctxt->cur[4] == 'L')) {
952 /*
953 * The first drafts were using <?XML and the final W3C REC
954 * now use <?xml ...
955 */
956 xmlParseXMLDecl(ctxt);
957 /* SKIP_EOL(cur); */
958 SKIP_BLANKS(ctxt->cur);
959 } else {
960 ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
961 }
962
963 /*
964 * The Misc part of the Prolog
965 * (Comment | PI | S) *
966 */
967 xmlParseMisc(ctxt);
968
969 /*
970 * Time to start parsing
971 */
972 ctxt->doc->root = xmlParseElement(ctxt);
973
974 return(0);
975}
976
977/*
978 * xmlParseDoc : parse an XML in-memory document and build a tree.
979 */
980
981xmlDocPtr xmlParseDoc(CHAR *cur) {
982 xmlDocPtr ret;
983 xmlParserCtxtPtr ctxt;
984
985 if (cur == NULL) return(NULL);
986
987 ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
988 if (ctxt == NULL) {
989 perror("malloc");
990 return(NULL);
991 }
992
993 xmlInitParserCtxt(ctxt);
994 ctxt->base = cur;
995 ctxt->cur = cur;
996
997 xmlParseDocument(ctxt);
998 ret = ctxt->doc;
999 free(ctxt->nodes);
1000 free(ctxt);
1001
1002 return(ret);
1003}
1004
1005/*
1006 * xmlParseFile : parse an XML file and build a tree.
1007 */
1008
1009xmlDocPtr xmlParseFile(const char *filename) {
1010 xmlDocPtr ret;
1011#ifdef HAVE_ZLIB_H
1012 gzFile input;
1013#else
1014 int input;
1015#endif
1016 int res;
1017 struct stat buf;
1018 char *buffer;
1019 xmlParserCtxtPtr ctxt;
1020
1021 res = stat(filename, &buf);
1022 if (res < 0) return(NULL);
1023
1024#ifdef HAVE_ZLIB_H
1025retry_bigger:
1026 buffer = malloc((buf.st_size * 20) + 100);
1027#else
1028 buffer = malloc(buf.st_size + 100);
1029#endif
1030 if (buffer == NULL) {
1031 perror("malloc");
1032 return(NULL);
1033 }
1034
1035 memset(buffer, 0, sizeof(buffer));
1036#ifdef HAVE_ZLIB_H
1037 input = gzopen (filename, "r");
1038 if (input == NULL) {
1039 fprintf (stderr, "Cannot read file %s :\n", filename);
1040 perror ("gzopen failed");
1041 return(NULL);
1042 }
1043#else
1044 input = open (filename, O_RDONLY);
1045 if (input < 0) {
1046 fprintf (stderr, "Cannot read file %s :\n", filename);
1047 perror ("open failed");
1048 return(NULL);
1049 }
1050#endif
1051#ifdef HAVE_ZLIB_H
1052 res = gzread(input, buffer, 20 * buf.st_size);
1053#else
1054 res = read(input, buffer, buf.st_size);
1055#endif
1056 if (res < 0) {
1057 fprintf (stderr, "Cannot read file %s :\n", filename);
1058#ifdef HAVE_ZLIB_H
1059 perror ("gzread failed");
1060#else
1061 perror ("read failed");
1062#endif
1063 return(NULL);
1064 }
1065#ifdef HAVE_ZLIB_H
1066 gzclose(input);
1067 if (res >= 20 * buf.st_size) {
1068 free(buffer);
1069 buf.st_size *= 2;
1070 goto retry_bigger;
1071 }
1072 buf.st_size = res;
1073#else
1074 close(input);
1075#endif
1076
1077
1078 ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
1079 if (ctxt == NULL) {
1080 perror("malloc");
1081 return(NULL);
1082 }
1083 buffer[buf.st_size] = '\0';
1084
1085 xmlInitParserCtxt(ctxt);
1086 ctxt->filename = filename;
1087 ctxt->base = buffer;
1088 ctxt->cur = buffer;
1089
1090 xmlParseDocument(ctxt);
1091 ret = ctxt->doc;
1092 free(buffer);
1093 free(ctxt->nodes);
1094 free(ctxt);
1095
1096 return(ret);
1097}
1098
1099/*
1100 * xmlParseFile : parse an XML memory block and build a tree.
1101 */
1102
1103xmlDocPtr xmlParseMemory(char *buffer, int size) {
1104 xmlDocPtr ret;
1105 xmlParserCtxtPtr ctxt;
1106
1107 ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
1108 if (ctxt == NULL) {
1109 perror("malloc");
1110 return(NULL);
1111 }
1112
1113 buffer[size - 1] = '\0';
1114
1115 xmlInitParserCtxt(ctxt);
1116 ctxt->base = buffer;
1117 ctxt->cur = buffer;
1118
1119 xmlParseDocument(ctxt);
1120 ret = ctxt->doc;
1121 free(ctxt->nodes);
1122 free(ctxt);
1123
1124 return(ret);
1125}
1126
1127
1128
1129
1130/* Initialize parser context */
1131void xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
1132{
1133 int i;
1134
1135 ctxt->filename = NULL;
1136 ctxt->base = NULL;
1137 ctxt->cur = NULL;
1138 ctxt->line = 1;
1139 ctxt->col = 1;
1140 ctxt->doc = NULL;
1141 ctxt->depth = 0;
1142 ctxt->max_depth = 10;
1143 ctxt->nodes = (xmlNodePtr *) malloc(ctxt->max_depth * sizeof(xmlNodePtr));
1144 if (ctxt->nodes == NULL) {
1145 fprintf(stderr, "malloc of %d byte failed\n",
1146 ctxt->max_depth * sizeof(xmlNodePtr));
1147 ctxt->max_depth = 0;
1148 } else {
1149 for (i = 0;i < ctxt->max_depth;i++)
1150 ctxt->nodes[i] = NULL;
1151 }
1152}
1153
1154
1155/*
1156 * Clear (release owned resources) and reinitialize context
1157 */
1158void xmlClearParserCtxt(xmlParserCtxtPtr ctx)
1159{
1160 xmlInitParserCtxt(ctx);
1161}
1162
1163
1164/*
1165 * Setup the parser context to parse a new buffer; Clears any prior
1166 * contents from the parser context. The buffer parameter must not be
1167 * NULL, but the filename parameter can be
1168 */
1169void xmlSetupParserForBuffer(xmlParserCtxtPtr ctxt, const CHAR* buffer,
1170 const char* filename)
1171{
1172 xmlClearParserCtxt(ctxt);
1173 ctxt->base = buffer;
1174 ctxt->cur = buffer;
1175 ctxt->filename = filename;
1176}
1177
1178
1179
1180void xmlReportError(xmlParserCtxtPtr ctx, const CHAR* msg)
1181{
1182 fputs(msg, stderr);
1183}