// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "content/renderer/savable_resources.h" #include <set> #include "base/compiler_specific.h" #include "base/logging.h" #include "base/strings/string_util.h" #include "third_party/WebKit/public/platform/WebString.h" #include "third_party/WebKit/public/platform/WebVector.h" #include "third_party/WebKit/public/web/WebDocument.h" #include "third_party/WebKit/public/web/WebElement.h" #include "third_party/WebKit/public/web/WebFrame.h" #include "third_party/WebKit/public/web/WebInputElement.h" #include "third_party/WebKit/public/web/WebNode.h" #include "third_party/WebKit/public/web/WebNodeCollection.h" #include "third_party/WebKit/public/web/WebNodeList.h" #include "third_party/WebKit/public/web/WebView.h" using blink::WebDocument; using blink::WebElement; using blink::WebFrame; using blink::WebInputElement; using blink::WebNode; using blink::WebNodeCollection; using blink::WebNodeList; using blink::WebString; using blink::WebVector; using blink::WebView; namespace content { namespace { // Structure for storage the unique set of all savable resource links for // making sure that no duplicated resource link in final result. The consumer // of the SavableResourcesUniqueCheck is responsible for keeping these pointers // valid for the lifetime of the SavableResourcesUniqueCheck instance. struct SavableResourcesUniqueCheck { // Unique set of all sub resource links. std::set<GURL>* resources_set; // Unique set of all frame links. std::set<GURL>* frames_set; // Collection of all frames we go through when getting all savable resource // links. std::vector<WebFrame*>* frames; SavableResourcesUniqueCheck() : resources_set(NULL), frames_set(NULL), frames(NULL) {} SavableResourcesUniqueCheck(std::set<GURL>* resources_set, std::set<GURL>* frames_set, std::vector<WebFrame*>* frames) : resources_set(resources_set), frames_set(frames_set), frames(frames) {} }; // Get all savable resource links from current element. One element might // have more than one resource link. It is possible to have some links // in one CSS stylesheet. void GetSavableResourceLinkForElement( const WebElement& element, const WebDocument& current_doc, SavableResourcesUniqueCheck* unique_check, SavableResourcesResult* result) { // Handle frame and iframe tag. if (element.hasTagName("iframe") || element.hasTagName("frame")) { WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element); if (sub_frame) unique_check->frames->push_back(sub_frame); return; } // Check whether the node has sub resource URL or not. WebString value = GetSubResourceLinkFromElement(element); if (value.isNull()) return; // Get absolute URL. GURL u = current_doc.completeURL(value); // ignore invalid URL if (!u.is_valid()) return; // Ignore those URLs which are not standard protocols. Because FTP // protocol does no have cache mechanism, we will skip all // sub-resources if they use FTP protocol. if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file")) return; // Ignore duplicated resource link. if (!unique_check->resources_set->insert(u).second) return; result->resources_list->push_back(u); // Insert referrer for above new resource link. result->referrer_urls_list->push_back(GURL()); result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault); } // Get all savable resource links from current WebFrameImpl object pointer. void GetAllSavableResourceLinksForFrame(WebFrame* current_frame, SavableResourcesUniqueCheck* unique_check, SavableResourcesResult* result, const char** savable_schemes) { // Get current frame's URL. GURL current_frame_url = current_frame->document().url(); // If url of current frame is invalid, ignore it. if (!current_frame_url.is_valid()) return; // If url of current frame is not a savable protocol, ignore it. bool is_valid_protocol = false; for (int i = 0; savable_schemes[i] != NULL; ++i) { if (current_frame_url.SchemeIs(savable_schemes[i])) { is_valid_protocol = true; break; } } if (!is_valid_protocol) return; // If find same frame we have recorded, ignore it. if (!unique_check->frames_set->insert(current_frame_url).second) return; // Get current using document. WebDocument current_doc = current_frame->document(); // Go through all descent nodes. WebNodeCollection all = current_doc.all(); // Go through all node in this frame. for (WebNode node = all.firstItem(); !node.isNull(); node = all.nextItem()) { // We only save HTML resources. if (!node.isElementNode()) continue; WebElement element = node.to<WebElement>(); GetSavableResourceLinkForElement(element, current_doc, unique_check, result); } } } // namespace WebString GetSubResourceLinkFromElement(const WebElement& element) { const char* attribute_name = NULL; if (element.hasHTMLTagName("img") || element.hasHTMLTagName("script")) { attribute_name = "src"; } else if (element.hasHTMLTagName("input")) { const WebInputElement input = element.toConst<WebInputElement>(); if (input.isImageButton()) { attribute_name = "src"; } } else if (element.hasHTMLTagName("body") || element.hasHTMLTagName("table") || element.hasHTMLTagName("tr") || element.hasHTMLTagName("td")) { attribute_name = "background"; } else if (element.hasHTMLTagName("blockquote") || element.hasHTMLTagName("q") || element.hasHTMLTagName("del") || element.hasHTMLTagName("ins")) { attribute_name = "cite"; } else if (element.hasHTMLTagName("link")) { // If the link element is not linked to css, ignore it. if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) { // TODO(jnd): Add support for extracting links of sub-resources which // are inside style-sheet such as @import, url(), etc. // See bug: http://b/issue?id=1111667. attribute_name = "href"; } } if (!attribute_name) return WebString(); WebString value = element.getAttribute(WebString::fromUTF8(attribute_name)); // If value has content and not start with "javascript:" then return it, // otherwise return NULL. if (!value.isNull() && !value.isEmpty() && !StartsWithASCII(value.utf8(), "javascript:", false)) return value; return WebString(); } // Get all savable resource links from current webview, include main // frame and sub-frame bool GetAllSavableResourceLinksForCurrentPage(WebView* view, const GURL& page_url, SavableResourcesResult* result, const char** savable_schemes) { WebFrame* main_frame = view->mainFrame(); if (!main_frame) return false; std::set<GURL> resources_set; std::set<GURL> frames_set; std::vector<WebFrame*> frames; SavableResourcesUniqueCheck unique_check(&resources_set, &frames_set, &frames); GURL main_page_gurl(main_frame->document().url()); // Make sure we are saving same page between embedder and webkit. // If page has being navigated, embedder will get three empty vector, // which will make the saving page job ended. if (page_url != main_page_gurl) return true; // First, process main frame. frames.push_back(main_frame); // Check all resource in this page, include sub-frame. for (int i = 0; i < static_cast<int>(frames.size()); ++i) { // Get current frame's all savable resource links. GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result, savable_schemes); } // Since frame's src can also point to sub-resources link, so it is possible // that some URLs in frames_list are also in resources_list. For those // URLs, we will remove it from frame_list, only keep them in resources_list. for (std::set<GURL>::iterator it = frames_set.begin(); it != frames_set.end(); ++it) { // Append unique frame source to savable frame list. if (resources_set.find(*it) == resources_set.end()) result->frames_list->push_back(*it); } return true; } } // namespace content