blob: b4787c233ef13703146bd1d3e0d98f1fa66dd469 [file] [log] [blame]
Ben Murdoche69819b2013-07-17 14:56:49 +01001/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
// How we handle the base tag better.
// Current status:
// At present, the normal way we handle the base tag is:
// a) Links which have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs which point to the
// locally saved file. These links are not resolved as absolute file URLs
// because, after moving the saved files from one directory to another, the
// absolute file URLs would become dead links.
// b) Links which have no corresponding locally saved files, such as links in
// A and AREA tags, are resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// Firefox also handles the base tag this way.
44//
// Problem:
// This approach cannot handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the
// base URL of the page while it is loading. Suppose we save "www.yahoo.com"
// as complete HTML to "c:\yahoo.htm". When we later load the saved page, the
// JavaScript inserts a base tag <base href="http://www.yahoo.com/"...> into
// the DOM, so all URLs which point to locally saved resource files are
// resolved as "http://www.yahoo.com/yahoo_files/...", which prevents the saved
// resource files from being loaded correctly. The page is also rendered badly
// because none of the saved sub-resource files (such as CSS and JavaScript
// files) and sub-frame files can be fetched.
// Firefox, IE and WebKit-based browsers currently all have this problem.
60//
// Solution:
// We comment out each old base tag and write a new base tag
// <base href="." ...> right after the commented one. WebKit always uses the
// most recent "href" attribute of a base tag to set the document's base URL.
// Based on this behavior, whenever we encounter a base tag, we comment it out
// and write a new base tag <base href="."> after it. The newly added base tag
// helps the engine locate the correct base URL for loading the locally saved
// resource files. We should also inherit the base target value from the
// document object when appending the new base tag.
// If there are multiple base tags in the original document, we comment out all
// of them and append a new base tag after each one, because we do not know
// whether the old base tags are original content or were added by JavaScript.
// If they were added by JavaScript, the script(s) will insert the base tag(s)
// into the DOM again when the saved page is loaded, so the newly added base
// tag(s) can override the incorrect base URL and make sure we always load the
// correct locally saved resource files.
77
78#include "config.h"
79#include "WebPageSerializerImpl.h"
80
Ben Murdoche69819b2013-07-17 14:56:49 +010081#include "HTMLNames.h"
82#include "WebFrameImpl.h"
83#include "core/dom/Document.h"
84#include "core/dom/DocumentType.h"
85#include "core/dom/Element.h"
86#include "core/editing/markup.h"
87#include "core/html/HTMLAllCollection.h"
88#include "core/html/HTMLElement.h"
89#include "core/html/HTMLFormElement.h"
90#include "core/html/HTMLHtmlElement.h"
91#include "core/html/HTMLMetaElement.h"
92#include "core/loader/DocumentLoader.h"
93#include "core/loader/FrameLoader.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010094#include "public/platform/WebVector.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010095#include "wtf/text/TextEncoding.h"
96
97using namespace WebCore;
98
Torne (Richard Coles)51b29062013-11-28 11:56:03 +000099namespace blink {
Ben Murdoche69819b2013-07-17 14:56:49 +0100100
// Soft upper bound, in characters, on how much generated HTML is accumulated
// in the data buffer before it is encoded and handed to the client. The limit
// can be exceeded when a single very large contiguous string is appended.
static const unsigned dataBufferCapacity = 64 * 1024;
105
106WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
107 const WTF::TextEncoding& textEncoding,
108 Document* document,
109 const String& directoryName)
110 : url(url)
111 , textEncoding(textEncoding)
112 , document(document)
113 , directoryName(directoryName)
114 , isHTMLDocument(document->isHTMLDocument())
115 , haveSeenDocType(false)
116 , haveAddedCharsetDeclaration(false)
117 , skipMetaElement(0)
118 , isInScriptOrStyleTag(false)
119 , haveAddedXMLProcessingDirective(false)
120 , haveAddedContentsBeforeEnd(false)
121{
122}
123
124String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
125 const Element* element, SerializeDomParam* param, bool* needSkip)
126{
127 StringBuilder result;
128
129 *needSkip = false;
130 if (param->isHTMLDocument) {
131 // Skip the open tag of original META tag which declare charset since we
132 // have overrided the META which have correct charset declaration after
133 // serializing open tag of HEAD element.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000134 ASSERT(element);
135 if (isHTMLMetaElement(*element)) {
136 const HTMLMetaElement& meta = toHTMLMetaElement(*element);
Ben Murdoche69819b2013-07-17 14:56:49 +0100137 // Check whether the META tag has declared charset or not.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000138 String equiv = meta.httpEquiv();
Ben Murdoche69819b2013-07-17 14:56:49 +0100139 if (equalIgnoringCase(equiv, "content-type")) {
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000140 String content = meta.content();
Ben Murdoche69819b2013-07-17 14:56:49 +0100141 if (content.length() && content.contains("charset", false)) {
142 // Find META tag declared charset, we need to skip it when
143 // serializing DOM.
144 param->skipMetaElement = element;
145 *needSkip = true;
146 }
147 }
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000148 } else if (isHTMLHtmlElement(*element)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100149 // Check something before processing the open tag of HEAD element.
150 // First we add doc type declaration if original document has it.
151 if (!param->haveSeenDocType) {
152 param->haveSeenDocType = true;
153 result.append(createMarkup(param->document->doctype()));
154 }
155
156 // Add MOTW declaration before html tag.
157 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
158 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000159 } else if (isHTMLBaseElement(*element)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100160 // Comment the BASE tag when serializing dom.
161 result.append("<!--");
162 }
163 } else {
164 // Write XML declaration.
165 if (!param->haveAddedXMLProcessingDirective) {
166 param->haveAddedXMLProcessingDirective = true;
167 // Get encoding info.
168 String xmlEncoding = param->document->xmlEncoding();
169 if (xmlEncoding.isEmpty())
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100170 xmlEncoding = param->document->encodingName();
Ben Murdoche69819b2013-07-17 14:56:49 +0100171 if (xmlEncoding.isEmpty())
172 xmlEncoding = UTF8Encoding().name();
173 result.append("<?xml version=\"");
174 result.append(param->document->xmlVersion());
175 result.append("\" encoding=\"");
176 result.append(xmlEncoding);
177 if (param->document->xmlStandalone())
178 result.append("\" standalone=\"yes");
179 result.append("\"?>\n");
180 }
181 // Add doc type declaration if original document has it.
182 if (!param->haveSeenDocType) {
183 param->haveSeenDocType = true;
184 result.append(createMarkup(param->document->doctype()));
185 }
186 }
187 return result.toString();
188}
189
190String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
191 const Element* element, SerializeDomParam* param)
192{
193 StringBuilder result;
194
195 param->haveAddedContentsBeforeEnd = false;
196 if (!param->isHTMLDocument)
197 return result.toString();
198 // Check after processing the open tag of HEAD element
199 if (!param->haveAddedCharsetDeclaration
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000200 && isHTMLHeadElement(*element)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100201 param->haveAddedCharsetDeclaration = true;
202 // Check meta element. WebKit only pre-parse the first 512 bytes
203 // of the document. If the whole <HEAD> is larger and meta is the
204 // end of head part, then this kind of pages aren't decoded correctly
205 // because of this issue. So when we serialize the DOM, we need to
206 // make sure the meta will in first child of head tag.
207 // See http://bugs.webkit.org/show_bug.cgi?id=16621.
208 // First we generate new content for writing correct META element.
209 result.append(WebPageSerializer::generateMetaCharsetDeclaration(
210 String(param->textEncoding.name())));
211
212 param->haveAddedContentsBeforeEnd = true;
213 // Will search each META which has charset declaration, and skip them all
214 // in PreActionBeforeSerializeOpenTag.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000215 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100216 param->isInScriptOrStyleTag = true;
217 }
218
219 return result.toString();
220}
221
222String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
223 const Element* element, SerializeDomParam* param, bool* needSkip)
224{
225 String result;
226
227 *needSkip = false;
228 if (!param->isHTMLDocument)
229 return result;
230 // Skip the end tag of original META tag which declare charset.
231 // Need not to check whether it's META tag since we guarantee
232 // skipMetaElement is definitely META tag if it's not 0.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000233 if (param->skipMetaElement == element) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100234 *needSkip = true;
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000235 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100236 ASSERT(param->isInScriptOrStyleTag);
237 param->isInScriptOrStyleTag = false;
238 }
239
240 return result;
241}
242
243// After we finish serializing end tag of a element, we give the target
244// element a chance to do some post work to add some additional data.
245String WebPageSerializerImpl::postActionAfterSerializeEndTag(
246 const Element* element, SerializeDomParam* param)
247{
248 StringBuilder result;
249
250 if (!param->isHTMLDocument)
251 return result.toString();
252 // Comment the BASE tag when serializing DOM.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000253 if (isHTMLBaseElement(*element)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100254 result.append("-->");
255 // Append a new base tag declaration.
256 result.append(WebPageSerializer::generateBaseTagDeclaration(
257 param->document->baseTarget()));
258 }
259
260 return result.toString();
261}
262
263void WebPageSerializerImpl::saveHTMLContentToBuffer(
264 const String& result, SerializeDomParam* param)
265{
266 m_dataBuffer.append(result);
267 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
268 param,
269 DoNotForceFlush);
270}
271
272void WebPageSerializerImpl::encodeAndFlushBuffer(
273 WebPageSerializerClient::PageSerializationStatus status,
274 SerializeDomParam* param,
275 FlushOption flushOption)
276{
277 // Data buffer is not full nor do we want to force flush.
278 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
279 return;
280
281 String content = m_dataBuffer.toString();
282 m_dataBuffer.clear();
283
Ben Murdoch02772c62013-07-26 10:21:05 +0100284 CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
Ben Murdoche69819b2013-07-17 14:56:49 +0100285
286 // Send result to the client.
287 m_client->didSerializeDataForFrame(param->url,
288 WebCString(encodedContent.data(), encodedContent.length()),
289 status);
290}
291
292void WebPageSerializerImpl::openTagToString(Element* element,
293 SerializeDomParam* param)
294{
295 bool needSkip;
296 StringBuilder result;
297 // Do pre action for open tag.
298 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
299 if (needSkip)
300 return;
301 // Add open tag
302 result.append('<');
303 result.append(element->nodeName().lower());
304 // Go through all attributes and serialize them.
305 if (element->hasAttributes()) {
306 unsigned numAttrs = element->attributeCount();
307 for (unsigned i = 0; i < numAttrs; i++) {
308 result.append(' ');
309 // Add attribute pair
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000310 const Attribute& attribute = element->attributeItem(i);
311 result.append(attribute.name().toString());
Ben Murdoche69819b2013-07-17 14:56:49 +0100312 result.appendLiteral("=\"");
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000313 if (!attribute.value().isEmpty()) {
314 const String& attrValue = attribute.value();
Ben Murdoche69819b2013-07-17 14:56:49 +0100315
316 // Check whether we need to replace some resource links
317 // with local resource paths.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000318 const QualifiedName& attrName = attribute.name();
Torne (Richard Coles)43e75022014-03-21 14:26:12 +0000319 if (element->hasLegalLinkAttribute(attrName)) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100320 // For links start with "javascript:", we do not change it.
321 if (attrValue.startsWith("javascript:", false))
322 result.append(attrValue);
323 else {
324 // Get the absolute link
325 WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerElement(element);
Ben Murdoch02772c62013-07-26 10:21:05 +0100326 String completeURL = subFrame ? subFrame->frame()->document()->url() :
Ben Murdoche69819b2013-07-17 14:56:49 +0100327 param->document->completeURL(attrValue);
328 // Check whether we have local files for those link.
329 if (m_localLinks.contains(completeURL)) {
330 if (!param->directoryName.isEmpty()) {
331 result.appendLiteral("./");
332 result.append(param->directoryName);
333 result.append('/');
334 }
335 result.append(m_localLinks.get(completeURL));
336 } else
337 result.append(completeURL);
338 }
339 } else {
340 if (param->isHTMLDocument)
341 result.append(m_htmlEntities.convertEntitiesInString(attrValue));
342 else
343 result.append(m_xmlEntities.convertEntitiesInString(attrValue));
344 }
345 }
346 result.append('\"');
347 }
348 }
349
350 // Do post action for open tag.
351 String addedContents = postActionAfterSerializeOpenTag(element, param);
352 // Complete the open tag for element when it has child/children.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000353 if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
Ben Murdoche69819b2013-07-17 14:56:49 +0100354 result.append('>');
355 // Append the added contents generate in post action of open tag.
356 result.append(addedContents);
357 // Save the result to data buffer.
358 saveHTMLContentToBuffer(result.toString(), param);
359}
360
361// Serialize end tag of an specified element.
362void WebPageSerializerImpl::endTagToString(Element* element,
363 SerializeDomParam* param)
364{
365 bool needSkip;
366 StringBuilder result;
367 // Do pre action for end tag.
368 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
369 if (needSkip)
370 return;
371 // Write end tag when element has child/children.
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000372 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
Ben Murdoche69819b2013-07-17 14:56:49 +0100373 result.appendLiteral("</");
374 result.append(element->nodeName().lower());
375 result.append('>');
376 } else {
377 // Check whether we have to write end tag for empty element.
378 if (param->isHTMLDocument) {
379 result.append('>');
380 // FIXME: This code is horribly wrong. WebPageSerializerImpl must die.
381 if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
382 // We need to write end tag when it is required.
383 result.appendLiteral("</");
384 result.append(element->nodeName().lower());
385 result.append('>');
386 }
387 } else {
388 // For xml base document.
389 result.appendLiteral(" />");
390 }
391 }
392 // Do post action for end tag.
393 result.append(postActionAfterSerializeEndTag(element, param));
394 // Save the result to data buffer.
395 saveHTMLContentToBuffer(result.toString(), param);
396}
397
398void WebPageSerializerImpl::buildContentForNode(Node* node,
399 SerializeDomParam* param)
400{
401 switch (node->nodeType()) {
402 case Node::ELEMENT_NODE:
403 // Process open tag of element.
404 openTagToString(toElement(node), param);
405 // Walk through the children nodes and process it.
406 for (Node *child = node->firstChild(); child; child = child->nextSibling())
407 buildContentForNode(child, param);
408 // Process end tag of element.
409 endTagToString(toElement(node), param);
410 break;
411 case Node::TEXT_NODE:
412 saveHTMLContentToBuffer(createMarkup(node), param);
413 break;
414 case Node::ATTRIBUTE_NODE:
415 case Node::DOCUMENT_NODE:
416 case Node::DOCUMENT_FRAGMENT_NODE:
417 // Should not exist.
418 ASSERT_NOT_REACHED();
419 break;
420 // Document type node can be in DOM?
421 case Node::DOCUMENT_TYPE_NODE:
422 param->haveSeenDocType = true;
423 default:
424 // For other type node, call default action.
425 saveHTMLContentToBuffer(createMarkup(node), param);
426 break;
427 }
428}
429
430WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
431 bool recursiveSerialization,
432 WebPageSerializerClient* client,
433 const WebVector<WebURL>& links,
434 const WebVector<WebString>& localPaths,
435 const WebString& localDirectoryName)
436 : m_client(client)
437 , m_recursiveSerialization(recursiveSerialization)
438 , m_framesCollected(false)
439 , m_localDirectoryName(localDirectoryName)
440 , m_htmlEntities(false)
441 , m_xmlEntities(true)
442{
443 // Must specify available webframe.
444 ASSERT(frame);
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100445 m_specifiedWebFrameImpl = toWebFrameImpl(frame);
Ben Murdoche69819b2013-07-17 14:56:49 +0100446 // Make sure we have non 0 client.
447 ASSERT(client);
448 // Build local resources map.
449 ASSERT(links.size() == localPaths.size());
450 for (size_t i = 0; i < links.size(); i++) {
451 KURL url = links[i];
452 ASSERT(!m_localLinks.contains(url.string()));
453 m_localLinks.set(url.string(), localPaths[i]);
454 }
455
456 ASSERT(m_dataBuffer.isEmpty());
457}
458
459void WebPageSerializerImpl::collectTargetFrames()
460{
461 ASSERT(!m_framesCollected);
462 m_framesCollected = true;
463
464 // First, process main frame.
465 m_frames.append(m_specifiedWebFrameImpl);
466 // Return now if user only needs to serialize specified frame, not including
467 // all sub-frames.
468 if (!m_recursiveSerialization)
469 return;
470 // Collect all frames inside the specified frame.
471 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
472 WebFrameImpl* currentFrame = m_frames[i];
473 // Get current using document.
474 Document* currentDoc = currentFrame->frame()->document();
475 // Go through sub-frames.
476 RefPtr<HTMLCollection> all = currentDoc->all();
477
Torne (Richard Coles)09380292014-02-21 12:17:33 +0000478 for (unsigned i = 0; Element* element = all->item(i); i++) {
479 if (!element->isHTMLElement())
Ben Murdoche69819b2013-07-17 14:56:49 +0100480 continue;
Ben Murdoche69819b2013-07-17 14:56:49 +0100481 WebFrameImpl* webFrame =
482 WebFrameImpl::fromFrameOwnerElement(element);
483 if (webFrame)
484 m_frames.append(webFrame);
485 }
486 }
487}
488
489bool WebPageSerializerImpl::serialize()
490{
491 if (!m_framesCollected)
492 collectTargetFrames();
493
494 bool didSerialization = false;
495 KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url();
496
497 for (unsigned i = 0; i < m_frames.size(); ++i) {
498 WebFrameImpl* webFrame = m_frames[i];
499 Document* document = webFrame->frame()->document();
500 const KURL& url = document->url();
501
502 if (!url.isValid() || !m_localLinks.contains(url.string()))
503 continue;
504
505 didSerialization = true;
506
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100507 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
Ben Murdoche69819b2013-07-17 14:56:49 +0100508 String directoryName = url == mainURL ? m_localDirectoryName : "";
509
510 SerializeDomParam param(url, textEncoding, document, directoryName);
511
512 Element* documentElement = document->documentElement();
513 if (documentElement)
514 buildContentForNode(documentElement, &param);
515
516 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
517 }
518
519 ASSERT(m_dataBuffer.isEmpty());
520 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
521 return didSerialization;
522}
523
Torne (Richard Coles)51b29062013-11-28 11:56:03 +0000524} // namespace blink