blob: f3ac50c8d3f63c41048c3d812ec2921fd826fc0d [file] [log] [blame]
Ben Murdoche69819b2013-07-17 14:56:49 +01001/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "WebPageSerializer.h"
33
34#include "HTMLNames.h"
35#include "WebFrame.h"
36#include "WebFrameImpl.h"
37#include "WebPageSerializerClient.h"
38#include "WebPageSerializerImpl.h"
39#include "WebView.h"
40#include "WebViewImpl.h"
41#include "core/dom/Document.h"
42#include "core/dom/Element.h"
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +000043#include "core/frame/LocalFrame.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010044#include "core/html/HTMLAllCollection.h"
Torne (Richard Coles)43e75022014-03-21 14:26:12 +000045#include "core/html/HTMLFrameElementBase.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010046#include "core/html/HTMLFrameOwnerElement.h"
47#include "core/html/HTMLInputElement.h"
48#include "core/html/HTMLTableElement.h"
49#include "core/loader/DocumentLoader.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010050#include "core/page/PageSerializer.h"
Torne (Richard Coles)bfe35902013-10-22 16:41:51 +010051#include "platform/SerializedResource.h"
Torne (Richard Coles)51b29062013-11-28 11:56:03 +000052#include "platform/mhtml/MHTMLArchive.h"
53#include "platform/weborigin/KURL.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010054#include "public/platform/WebCString.h"
55#include "public/platform/WebString.h"
56#include "public/platform/WebURL.h"
57#include "public/platform/WebVector.h"
Ben Murdoche69819b2013-07-17 14:56:49 +010058#include "wtf/Vector.h"
59#include "wtf/text/StringConcatenate.h"
60
61using namespace WebCore;
62
63namespace {
64
65KURL getSubResourceURLFromElement(Element* element)
66{
67 ASSERT(element);
Torne (Richard Coles)43e75022014-03-21 14:26:12 +000068 const QualifiedName& attributeName = element->subResourceAttributeName();
69 if (attributeName == nullQName())
Ben Murdoche69819b2013-07-17 14:56:49 +010070 return KURL();
71
Torne (Richard Coles)43e75022014-03-21 14:26:12 +000072 String value = element->getAttribute(attributeName);
Ben Murdoche69819b2013-07-17 14:56:49 +010073 // Ignore javascript content.
74 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
75 return KURL();
Ben Murdoch02772c62013-07-26 10:21:05 +010076
Torne (Richard Coles)8abfc582013-09-12 12:10:38 +010077 return element->document().completeURL(value);
Ben Murdoche69819b2013-07-17 14:56:49 +010078}
79
80void retrieveResourcesForElement(Element* element,
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +000081 Vector<LocalFrame*>* visitedFrames,
82 Vector<LocalFrame*>* framesToVisit,
Ben Murdoche69819b2013-07-17 14:56:49 +010083 Vector<KURL>* frameURLs,
84 Vector<KURL>* resourceURLs)
85{
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +000086 ASSERT(element);
Ben Murdoche69819b2013-07-17 14:56:49 +010087 // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
Torne (Richard Coles)43e75022014-03-21 14:26:12 +000088 if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) {
Ben Murdoch07a852d2014-03-31 11:51:52 +010089 Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame();
90 if (frame && frame->isLocalFrame()) {
91 if (!visitedFrames->contains(toLocalFrame(frame)))
92 framesToVisit->append(toLocalFrame(frame));
Ben Murdoche69819b2013-07-17 14:56:49 +010093 return;
94 }
95 }
96
97 KURL url = getSubResourceURLFromElement(element);
98 if (url.isEmpty() || !url.isValid())
99 return; // No subresource for this node.
100
101 // Ignore URLs that have a non-standard protocols. Since the FTP protocol
102 // does no have a cache mechanism, we skip it as well.
103 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
104 return;
105
106 if (!resourceURLs->contains(url))
107 resourceURLs->append(url);
108}
109
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000110void retrieveResourcesForFrame(LocalFrame* frame,
Torne (Richard Coles)51b29062013-11-28 11:56:03 +0000111 const blink::WebVector<blink::WebCString>& supportedSchemes,
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000112 Vector<LocalFrame*>* visitedFrames,
113 Vector<LocalFrame*>* framesToVisit,
Ben Murdoche69819b2013-07-17 14:56:49 +0100114 Vector<KURL>* frameURLs,
115 Vector<KURL>* resourceURLs)
116{
Torne (Richard Coles)f79f16f2013-10-31 11:16:44 +0000117 KURL frameURL = frame->loader().documentLoader()->request().url();
Ben Murdoche69819b2013-07-17 14:56:49 +0100118
119 // If the frame's URL is invalid, ignore it, it is not retrievable.
120 if (!frameURL.isValid())
121 return;
122
123 // Ignore frames from unsupported schemes.
124 bool isValidScheme = false;
125 for (size_t i = 0; i < supportedSchemes.size(); ++i) {
126 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
127 isValidScheme = true;
128 break;
129 }
130 }
131 if (!isValidScheme)
132 return;
133
134 // If we have already seen that frame, ignore it.
135 if (visitedFrames->contains(frame))
136 return;
137 visitedFrames->append(frame);
138 if (!frameURLs->contains(frameURL))
139 frameURLs->append(frameURL);
Ben Murdoch02772c62013-07-26 10:21:05 +0100140
Ben Murdoche69819b2013-07-17 14:56:49 +0100141 // Now get the resources associated with each node of the document.
Torne (Richard Coles)09380292014-02-21 12:17:33 +0000142 RefPtr<HTMLCollection> allElements = frame->document()->all();
143 for (unsigned i = 0; i < allElements->length(); ++i) {
144 Element* element = allElements->item(i);
145 retrieveResourcesForElement(element,
Ben Murdoche69819b2013-07-17 14:56:49 +0100146 visitedFrames, framesToVisit,
147 frameURLs, resourceURLs);
148 }
149}
150
151} // namespace
152
Torne (Richard Coles)51b29062013-11-28 11:56:03 +0000153namespace blink {
Ben Murdoche69819b2013-07-17 14:56:49 +0100154
155void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
156{
157 Vector<SerializedResource> resources;
158 PageSerializer serializer(&resources);
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100159 serializer.serialize(toWebViewImpl(view)->page());
Ben Murdoche69819b2013-07-17 14:56:49 +0100160
161 Vector<Resource> result;
162 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
163 Resource resource;
164 resource.url = iter->url;
165 resource.mimeType = iter->mimeType.ascii();
166 // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData().
167 resource.data = WebCString(iter->data->data(), iter->data->size());
168 result.append(resource);
169 }
170
Ben Murdoch02772c62013-07-26 10:21:05 +0100171 *resourcesParam = result;
Ben Murdoche69819b2013-07-17 14:56:49 +0100172}
173
174static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy)
175{
176 Vector<SerializedResource> resources;
177 PageSerializer serializer(&resources);
178 serializer.serialize(page);
179 Document* document = page->mainFrame()->document();
180 return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType());
181}
182
183WebCString WebPageSerializer::serializeToMHTML(WebView* view)
184{
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100185 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding);
Ben Murdoche69819b2013-07-17 14:56:49 +0100186 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
187 return WebCString(mhtml->data(), mhtml->size());
188}
189
190WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
191{
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100192 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding);
Ben Murdoche69819b2013-07-17 14:56:49 +0100193 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
194 return WebCString(mhtml->data(), mhtml->size());
195}
196
197bool WebPageSerializer::serialize(WebFrame* frame,
198 bool recursive,
199 WebPageSerializerClient* client,
200 const WebVector<WebURL>& links,
201 const WebVector<WebString>& localPaths,
202 const WebString& localDirectoryName)
203{
204 WebPageSerializerImpl serializerImpl(
205 frame, recursive, client, links, localPaths, localDirectoryName);
206 return serializerImpl.serialize();
207}
208
209bool WebPageSerializer::retrieveAllResources(WebView* view,
210 const WebVector<WebCString>& supportedSchemes,
211 WebVector<WebURL>* resourceURLs,
212 WebVector<WebURL>* frameURLs) {
Torne (Richard Coles)c0e19a62013-08-30 15:15:11 +0100213 WebFrameImpl* mainFrame = toWebFrameImpl(view->mainFrame());
Ben Murdoche69819b2013-07-17 14:56:49 +0100214 if (!mainFrame)
215 return false;
216
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000217 Vector<LocalFrame*> framesToVisit;
218 Vector<LocalFrame*> visitedFrames;
Ben Murdoche69819b2013-07-17 14:56:49 +0100219 Vector<KURL> frameKURLs;
220 Vector<KURL> resourceKURLs;
Ben Murdoch02772c62013-07-26 10:21:05 +0100221
Ben Murdoche69819b2013-07-17 14:56:49 +0100222 // Let's retrieve the resources from every frame in this page.
223 framesToVisit.append(mainFrame->frame());
224 while (!framesToVisit.isEmpty()) {
Torne (Richard Coles)d5428f32014-03-18 10:21:16 +0000225 LocalFrame* frame = framesToVisit[0];
Ben Murdoche69819b2013-07-17 14:56:49 +0100226 framesToVisit.remove(0);
227 retrieveResourcesForFrame(frame, supportedSchemes,
228 &visitedFrames, &framesToVisit,
229 &frameKURLs, &resourceKURLs);
230 }
231
232 // Converts the results to WebURLs.
233 WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
234 for (size_t i = 0; i < resourceKURLs.size(); ++i) {
235 resultResourceURLs[i] = resourceKURLs[i];
236 // A frame's src can point to the same URL as another resource, keep the
237 // resource URL only in such cases.
238 size_t index = frameKURLs.find(resourceKURLs[i]);
Torne (Richard Coles)06f816c2013-09-26 13:25:12 +0100239 if (index != kNotFound)
Ben Murdoche69819b2013-07-17 14:56:49 +0100240 frameKURLs.remove(index);
241 }
242 *resourceURLs = resultResourceURLs;
243 WebVector<WebURL> resultFrameURLs(frameKURLs.size());
244 for (size_t i = 0; i < frameKURLs.size(); ++i)
245 resultFrameURLs[i] = frameKURLs[i];
246 *frameURLs = resultFrameURLs;
Ben Murdoch02772c62013-07-26 10:21:05 +0100247
Ben Murdoche69819b2013-07-17 14:56:49 +0100248 return true;
249}
250
251WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
252{
253 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">";
254 return charsetString;
255}
256
257WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
258{
259 return String::format("\n<!-- saved from url=(%04d)%s -->\n",
260 static_cast<int>(url.spec().length()),
261 url.spec().data());
262}
263
264WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
265{
266 if (baseTarget.isEmpty())
Ben Murdoch3c9e4ae2013-08-12 14:20:44 +0100267 return String("<base href=\".\">");
Ben Murdoche69819b2013-07-17 14:56:49 +0100268 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">";
269 return baseString;
270}
271
Torne (Richard Coles)51b29062013-11-28 11:56:03 +0000272} // namespace blink