Keep non-significant blank nodes in HTML parser
For https://bugzilla.gnome.org/show_bug.cgi?id=681822
Regardless of whether the option HTML_PARSE_NOBLANKS is set, blank nodes
are removed from an HTML document. For example:
<html>
<head>
<title>This is a test.</title>
</head>
<body>
<p>This is a test.</p>
</body>
</html>
is read as:
<html><head><title>This is a test.</title></head><body>
<p>This is a test.</p>
</body></html>
This changes the default behaviour, but the old behaviour is still available,
as expected, when using the parser flag HTML_PARSE_NOBLANKS.
Based on original patch from Igor Ignatyuk <igor_ignatiouk@hotmail.com>
* HTMLparser.c: change various places in the parser where the
ignorableWhitespace SAX callback was called without checking the parser
flag preference (ctxt->keepBlanks)
* xmllint.c: make sure we use the new flag even for HTML parsing
* result/HTML/*: this modifies the output of a number of tests
diff --git a/HTMLparser.c b/HTMLparser.c
index 09a9a4b..a2976f0 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2981,9 +2981,14 @@
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(ctxt->userData,
- buf, nbchar);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(ctxt->userData, buf, nbchar);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(ctxt->userData,
+ buf, nbchar);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
@@ -3014,8 +3019,14 @@
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(ctxt->userData, buf, nbchar);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(ctxt->userData,
+ buf, nbchar);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
@@ -5687,9 +5698,15 @@
if ((cur != '<') && (cur != '&')) {
if (ctxt->sax != NULL) {
if (IS_BLANK_CH(cur)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(
- ctxt->userData, &cur, 1);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(
+ ctxt->userData, &cur, 1);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(
+ ctxt->userData, &cur, 1);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)