/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "WebPageSerializer.h"

#include "DocumentLoader.h"
#include "Element.h"
#include "Frame.h"
#include "HTMLAllCollection.h"
#include "HTMLFrameOwnerElement.h"
#include "HTMLInputElement.h"
#include "HTMLNames.h"
#include "KURL.h"
#include "Vector.h"

#include "WebCString.h"
#include "WebFrame.h"
#include "WebFrameImpl.h"
#include "WebPageSerializerClient.h"
#include "WebPageSerializerImpl.h"
#include "WebString.h"
#include "WebURL.h"
#include "WebVector.h"
#include "WebView.h"

#include <wtf/text/StringConcatenate.h>

using namespace WebCore;

namespace {

// Returns the absolute URL of the sub-resource that |element| references, or
// an empty KURL when the element references none.
//
// The attribute that carries the URL depends on the tag:
//   img, script, embed, input (image buttons only) -> src
//   body, table, tr, td                            -> background
//   blockquote, q, del, ins                        -> cite
//   link (only when type is "text/css")            -> href
//   object                                         -> data
// Empty attribute values and "javascript:" values yield an empty KURL.
KURL getSubResourceURLFromElement(Element* element)
{
    ASSERT(element);
    const QualifiedName* attributeName = 0;
    if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
        attributeName = &HTMLNames::srcAttr;
    else if (element->hasTagName(HTMLNames::inputTag)) {
        HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
        if (input->isImageButton())
            attributeName = &HTMLNames::srcAttr;
    } else if (element->hasTagName(HTMLNames::bodyTag)
               || element->hasTagName(HTMLNames::tableTag)
               || element->hasTagName(HTMLNames::trTag)
               || element->hasTagName(HTMLNames::tdTag))
        attributeName = &HTMLNames::backgroundAttr;
    else if (element->hasTagName(HTMLNames::blockquoteTag)
             || element->hasTagName(HTMLNames::qTag)
             || element->hasTagName(HTMLNames::delTag)
             || element->hasTagName(HTMLNames::insTag))
        attributeName = &HTMLNames::citeAttr;
    else if (element->hasTagName(HTMLNames::linkTag)) {
        // If the link element is not css, ignore it.
        if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
            // FIXME: Add support for extracting links of sub-resources which
            // are inside style-sheet such as @import, @font-face, url(), etc.
            attributeName = &HTMLNames::hrefAttr;
        }
    } else if (element->hasTagName(HTMLNames::objectTag))
        attributeName = &HTMLNames::dataAttr;
    else if (element->hasTagName(HTMLNames::embedTag))
        attributeName = &HTMLNames::srcAttr;

    // No attribute selected: this element carries no sub-resource.
    if (!attributeName)
        return KURL();

    String value = element->getAttribute(*attributeName);
    // Ignore javascript content.
    if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
        return KURL();

    // Resolve the (possibly relative) value against the owning document.
    return element->document()->completeURL(value);
}

// Records the resource referenced by |element|, or queues the frame it hosts.
//
// If |element| is an iframe/frame/object/embed frame owner with a content
// frame, that frame is appended to |framesToVisit| (unless it is already in
// |visitedFrames|) and nothing else is recorded for the element. Otherwise the
// element's sub-resource URL is appended to |resourceURLs| when it is valid,
// is HTTP-family or a local file, and is not already in the list.
//
// |frameURLs| is accepted for signature symmetry with retrieveResourcesForFrame
// but is not used by this function.
void retrieveResourcesForElement(Element* element,
                                 Vector<Frame*>* visitedFrames,
                                 Vector<Frame*>* framesToVisit,
                                 Vector<KURL>* frameURLs,
                                 Vector<KURL>* resourceURLs)
{
    // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
    if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
         || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
        && element->isFrameOwnerElement()) {
        Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
        if (frame) {
            if (!visitedFrames->contains(frame))
                framesToVisit->append(frame);
            return;
        }
        // A frame owner without a content frame falls through and is treated
        // like any other element (e.g. an <object> pointing at a plain resource).
    }

    KURL url = getSubResourceURLFromElement(element);
    if (url.isEmpty() || !url.isValid())
        return; // No subresource for this node.

    // Ignore URLs that have a non-standard protocol. Since the FTP protocol
    // does not have a cache mechanism, we skip it as well.
    if (!url.protocolInHTTPFamily() && !url.isLocalFile())
        return;

    if (!resourceURLs->contains(url))
        resourceURLs->append(url);
}

// Collects the URL of |frame| and the resources of every element in its
// document.
//
// The frame is skipped entirely when its URL is invalid, when its scheme is
// not one of |supportedSchemes|, or when it is already in |visitedFrames|.
// Otherwise the frame is marked visited, its URL is added to |frameURLs| (if
// not already present), and each element of the document is handed to
// retrieveResourcesForElement, which may append to |framesToVisit| and
// |resourceURLs|.
void retrieveResourcesForFrame(Frame* frame,
                               const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
                               Vector<Frame*>* visitedFrames,
                               Vector<Frame*>* framesToVisit,
                               Vector<KURL>* frameURLs,
                               Vector<KURL>* resourceURLs)
{
    KURL frameURL = frame->loader()->documentLoader()->request().url();

    // If the frame's URL is invalid, ignore it, it is not retrievable.
    if (!frameURL.isValid())
        return;

    // Ignore frames from unsupported schemes.
    bool isValidScheme = false;
    for (size_t i = 0; i < supportedSchemes.size(); ++i) {
        if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
            isValidScheme = true;
            break;
        }
    }
    if (!isValidScheme)
        return;

    // If we have already seen that frame, ignore it.
    if (visitedFrames->contains(frame))
        return;
    visitedFrames->append(frame);
    if (!frameURLs->contains(frameURL))
        frameURLs->append(frameURL);

    // Now get the resources associated with each node of the document.
    RefPtr<HTMLAllCollection> allNodes = frame->document()->all();
    for (unsigned i = 0; i < allNodes->length(); ++i) {
        Node* node = allNodes->item(i);
        // We are only interested in HTML resources.
        if (!node->isElementNode())
            continue;
        retrieveResourcesForElement(static_cast<Element*>(node),
                                    visitedFrames, framesToVisit,
                                    frameURLs, resourceURLs);
    }
}

} // namespace

namespace WebKit {

// Serializes |frame| by delegating to WebPageSerializerImpl, forwarding every
// argument unchanged, and returns the implementation's result.
bool WebPageSerializer::serialize(WebFrame* frame,
                                  bool recursive,
                                  WebPageSerializerClient* client,
                                  const WebVector<WebURL>& links,
                                  const WebVector<WebString>& localPaths,
                                  const WebString& localDirectoryName)
{
    WebPageSerializerImpl serializerImpl(
        frame, recursive, client, links, localPaths, localDirectoryName);
    return serializerImpl.serialize();
}

// Gathers the URLs of all frames and sub-resources reachable from the main
// frame of |view|.
//
// Frames whose scheme is not listed in |supportedSchemes| are ignored. On
// success |*resourceURLs| receives the sub-resource URLs and |*frameURLs|
// receives the frame URLs; a URL that appears in both sets is reported only
// as a resource. Returns false (leaving the outputs untouched) when the view
// has no main frame, true otherwise.
bool WebPageSerializer::retrieveAllResources(WebView* view,
                                             const WebVector<WebCString>& supportedSchemes,
                                             WebVector<WebURL>* resourceURLs,
                                             WebVector<WebURL>* frameURLs)
{
    WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
    if (!mainFrame)
        return false;

    Vector<Frame*> framesToVisit;
    Vector<Frame*> visitedFrames;
    Vector<KURL> frameKURLs;
    Vector<KURL> resourceKURLs;

    // Let's retrieve the resources from every frame in this page.
    // framesToVisit is used as a FIFO queue: visiting a frame may enqueue the
    // child frames it hosts.
    framesToVisit.append(mainFrame->frame());
    while (!framesToVisit.isEmpty()) {
        Frame* frame = framesToVisit[0];
        framesToVisit.remove(0);
        retrieveResourcesForFrame(frame, supportedSchemes,
                                  &visitedFrames, &framesToVisit,
                                  &frameKURLs, &resourceKURLs);
    }

    // Converts the results to WebURLs.
    WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
    for (size_t i = 0; i < resourceKURLs.size(); ++i) {
        resultResourceURLs[i] = resourceKURLs[i];
        // A frame's src can point to the same URL as another resource, keep the
        // resource URL only in such cases.
        size_t index = frameKURLs.find(resourceKURLs[i]);
        if (index != notFound)
            frameKURLs.remove(index);
    }
    *resourceURLs = resultResourceURLs;

    WebVector<WebURL> resultFrameURLs(frameKURLs.size());
    for (size_t i = 0; i < frameKURLs.size(); ++i)
        resultFrameURLs[i] = frameKURLs[i];
    *frameURLs = resultFrameURLs;

    return true;
}

// Returns a <meta http-equiv="Content-Type"> tag declaring |charset|.
// NOTE(review): |charset| is inserted verbatim into an attribute value;
// confirm callers only pass encoding names that need no HTML escaping.
WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
{
    return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=",
                      static_cast<const String&>(charset),
                      "\">");
}

// Returns the "saved from url" HTML comment for |url|, surrounded by newlines:
//     <!-- saved from url=(NNNN)URL -->
// where NNNN is the zero-padded (at least four digits) length of the URL text
// that follows it.
//
// The URL comes from page content, so it must not be able to terminate the
// comment early. The second '-' of every adjacent pair is therefore emitted as
// "%2D", which guarantees the embedded text never contains "--" (and hence
// never "-->"). The reported length is the length of the escaped text.
WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
{
    const WebCString& spec = url.spec();
    const size_t specLength = spec.length();

    Vector<char> escapedSpec;
    bool previousWasMinus = false;
    for (size_t i = 0; i < specLength; ++i) {
        const char character = spec.data()[i];
        if (character == '-' && previousWasMinus) {
            escapedSpec.append("%2D", 3);
            // The escaped character no longer counts as a '-', so a following
            // '-' starts a new potential pair.
            previousWasMinus = false;
            continue;
        }
        previousWasMinus = character == '-';
        escapedSpec.append(character);
    }

    const int escapedLength = static_cast<int>(escapedSpec.size());
    escapedSpec.append('\0'); // String::format's %s needs a C string.

    return String::format("\n<!-- saved from url=(%04d)%s -->\n",
                          escapedLength,
                          escapedSpec.data());
}

// Returns a <base href="."> tag, adding a target attribute set to
// |baseTarget| when it is non-empty.
// NOTE(review): |baseTarget| is inserted verbatim into an attribute value;
// confirm callers pass a value that cannot contain a double quote.
WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
{
    if (baseTarget.isEmpty())
        return makeString("<base href=\".\">");
    return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">");
}

} // namespace WebKit