/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// How we handle the base tag better.
// Current status:
// Currently, the normal way we handle the base tag is:
// a) For those links which have corresponding local saved files, such as
// savable CSS, JavaScript files, they will be written as relative URLs which
// point to the local saved file. These links can not be resolved as absolute
// file URLs because, if they were resolved as absolute URLs, moving the
// file location from one directory to another directory would turn the file
// URLs into dead links.
// b) For those links which have no corresponding local saved files, such as
// links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// Firefox also uses the above way to handle the base tag.
//
// Problem:
// This way can not handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the base
// URL of the page when loading the page. So when saving the page as
// completed-HTML, assume that we save "www.yahoo.com" to "c:\yahoo.htm". When
// we then load the saved completed-HTML page, the JavaScript will insert a base
// tag <base href="http://www.yahoo.com/"...> into the DOM, so all URLs which
// point to local saved resource files will be resolved as
// "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
// files to fail to load. The page will also be rendered badly since
// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
// files can not be fetched.
// Currently Firefox, IE and WebKit based browsers all have this problem.
//
// Solution:
// My solution is that we comment out the old base tag and write a new base tag:
// <base href="." ...> after the previous commented base tag. WebKit
// always uses the latest "href" attribute of a base tag to set the document's
// base URL. Based on this behavior, when we encounter a base tag, we comment it
// out and write a new base tag <base href="."> after the commented base tag.
// The newly added base tag helps the engine locate the correct base URL for
// correctly loading local saved resource files. Also I think we need to inherit
// the base target value from the document object when appending the new base
// tag.
// If there are multiple base tags in the original document, we will comment out
// all old base tags and append a new base tag after each old base tag because
// we do not know whether those old base tags are original content or were added
// by JavaScript. If they were added by JavaScript, it means that when loading
// the saved page, the script(s) will still insert base tag(s) into the DOM, so
// the newly added base tag(s) can override the incorrect base URL and make sure
// we always load the correct local saved resource files.

#include "config.h"
#include "WebPageSerializerImpl.h"

#include "Document.h"
#include "DocumentLoader.h"
#include "DocumentType.h"
#include "Element.h"
#include "FrameLoader.h"
#include "HTMLAllCollection.h"
#include "HTMLElement.h"
#include "HTMLFormElement.h"
#include "HTMLMetaElement.h"
#include "HTMLNames.h"
#include "KURL.h"
#include "TextEncoding.h"
#include "markup.h"

#include "DOMUtilitiesPrivate.h"
#include "WebFrameImpl.h"
#include "WebURL.h"
#include "WebVector.h"

using namespace WebCore;

namespace WebKit {

// Maximum length of the data buffer which is used to temporarily save generated
// html content data. This is a soft limit which might be exceeded if a very
// large contiguous string is found in the page.
static const unsigned dataBufferCapacity = 65536; WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, const TextEncoding& textEncoding, Document* document, const String& directoryName) : url(url) , textEncoding(textEncoding) , document(document) , directoryName(directoryName) , isHTMLDocument(document->isHTMLDocument()) , haveSeenDocType(false) , haveAddedCharsetDeclaration(false) , skipMetaElement(0) , isInScriptOrStyleTag(false) , haveAddedXMLProcessingDirective(false) , haveAddedContentsBeforeEnd(false) { } String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( const Element* element, SerializeDomParam* param, bool* needSkip) { StringBuilder result; *needSkip = false; if (param->isHTMLDocument) { // Skip the open tag of original META tag which declare charset since we // have overrided the META which have correct charset declaration after // serializing open tag of HEAD element. if (element->hasTagName(HTMLNames::metaTag)) { const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element); // Check whether the META tag has declared charset or not. String equiv = meta->httpEquiv(); if (equalIgnoringCase(equiv, "content-type")) { String content = meta->content(); if (content.length() && content.contains("charset", false)) { // Find META tag declared charset, we need to skip it when // serializing DOM. param->skipMetaElement = element; *needSkip = true; } } } else if (element->hasTagName(HTMLNames::htmlTag)) { // Check something before processing the open tag of HEAD element. // First we add doc type declaration if original document has it. if (!param->haveSeenDocType) { param->haveSeenDocType = true; result.append(createMarkup(param->document->doctype())); } // Add MOTW declaration before html tag. // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. 
result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url)); } else if (element->hasTagName(HTMLNames::baseTag)) { // Comment the BASE tag when serializing dom. result.append("<!--"); } } else { // Write XML declaration. if (!param->haveAddedXMLProcessingDirective) { param->haveAddedXMLProcessingDirective = true; // Get encoding info. String xmlEncoding = param->document->xmlEncoding(); if (xmlEncoding.isEmpty()) xmlEncoding = param->document->loader()->writer()->encoding(); if (xmlEncoding.isEmpty()) xmlEncoding = UTF8Encoding().name(); result.append("<?xml version=\""); result.append(param->document->xmlVersion()); result.append("\" encoding=\""); result.append(xmlEncoding); if (param->document->xmlStandalone()) result.append("\" standalone=\"yes"); result.append("\"?>\n"); } // Add doc type declaration if original document has it. if (!param->haveSeenDocType) { param->haveSeenDocType = true; result.append(createMarkup(param->document->doctype())); } } return result.toString(); } String WebPageSerializerImpl::postActionAfterSerializeOpenTag( const Element* element, SerializeDomParam* param) { StringBuilder result; param->haveAddedContentsBeforeEnd = false; if (!param->isHTMLDocument) return result.toString(); // Check after processing the open tag of HEAD element if (!param->haveAddedCharsetDeclaration && element->hasTagName(HTMLNames::headTag)) { param->haveAddedCharsetDeclaration = true; // Check meta element. WebKit only pre-parse the first 512 bytes // of the document. If the whole <HEAD> is larger and meta is the // end of head part, then this kind of pages aren't decoded correctly // because of this issue. So when we serialize the DOM, we need to // make sure the meta will in first child of head tag. // See http://bugs.webkit.org/show_bug.cgi?id=16621. // First we generate new content for writing correct META element. 
result.append(WebPageSerializer::generateMetaCharsetDeclaration( String(param->textEncoding.name()))); param->haveAddedContentsBeforeEnd = true; // Will search each META which has charset declaration, and skip them all // in PreActionBeforeSerializeOpenTag. } else if (element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::styleTag)) { param->isInScriptOrStyleTag = true; } return result.toString(); } String WebPageSerializerImpl::preActionBeforeSerializeEndTag( const Element* element, SerializeDomParam* param, bool* needSkip) { String result; *needSkip = false; if (!param->isHTMLDocument) return result; // Skip the end tag of original META tag which declare charset. // Need not to check whether it's META tag since we guarantee // skipMetaElement is definitely META tag if it's not 0. if (param->skipMetaElement == element) *needSkip = true; else if (element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::styleTag)) { ASSERT(param->isInScriptOrStyleTag); param->isInScriptOrStyleTag = false; } return result; } // After we finish serializing end tag of a element, we give the target // element a chance to do some post work to add some additional data. String WebPageSerializerImpl::postActionAfterSerializeEndTag( const Element* element, SerializeDomParam* param) { StringBuilder result; if (!param->isHTMLDocument) return result.toString(); // Comment the BASE tag when serializing DOM. if (element->hasTagName(HTMLNames::baseTag)) { result.append("-->"); // Append a new base tag declaration. 
result.append(WebPageSerializer::generateBaseTagDeclaration( param->document->baseTarget())); } return result.toString(); } void WebPageSerializerImpl::saveHTMLContentToBuffer( const String& result, SerializeDomParam* param) { m_dataBuffer.append(result); encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, param, DoNotForceFlush); } void WebPageSerializerImpl::encodeAndFlushBuffer( WebPageSerializerClient::PageSerializationStatus status, SerializeDomParam* param, FlushOption flushOption) { // Data buffer is not full nor do we want to force flush. if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity) return; String content = m_dataBuffer.toString(); m_dataBuffer = StringBuilder(); // Convert the unicode content to target encoding CString encodedContent = param->textEncoding.encode( content.characters(), content.length(), EntitiesForUnencodables); // Send result to the client. m_client->didSerializeDataForFrame(param->url, WebCString(encodedContent.data(), encodedContent.length()), status); } void WebPageSerializerImpl::openTagToString(Element* element, SerializeDomParam* param) { // FIXME: use StringBuilder instead of String. bool needSkip; // Do pre action for open tag. String result = preActionBeforeSerializeOpenTag(element, param, &needSkip); if (needSkip) return; // Add open tag result += "<" + element->nodeName().lower(); // Go through all attributes and serialize them. const NamedNodeMap *attrMap = element->attributes(true); if (attrMap) { unsigned numAttrs = attrMap->length(); for (unsigned i = 0; i < numAttrs; i++) { result += " "; // Add attribute pair const Attribute *attribute = attrMap->attributeItem(i); result += attribute->name().toString(); result += "=\""; if (!attribute->value().isEmpty()) { const String& attrValue = attribute->value(); // Check whether we need to replace some resource links // with local resource paths. 
const QualifiedName& attrName = attribute->name(); if (elementHasLegalLinkAttribute(element, attrName)) { // For links start with "javascript:", we do not change it. if (attrValue.startsWith("javascript:", false)) result += attrValue; else { // Get the absolute link WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerElement(element); String completeURL = subFrame ? subFrame->frame()->document()->url() : param->document->completeURL(attrValue); // Check whether we have local files for those link. if (m_localLinks.contains(completeURL)) { if (!param->directoryName.isEmpty()) result += "./" + param->directoryName + "/"; result += m_localLinks.get(completeURL); } else result += completeURL; } } else { if (param->isHTMLDocument) result += m_htmlEntities.convertEntitiesInString(attrValue); else result += m_xmlEntities.convertEntitiesInString(attrValue); } } result += "\""; } } // Do post action for open tag. String addedContents = postActionAfterSerializeOpenTag(element, param); // Complete the open tag for element when it has child/children. if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) result += ">"; // Append the added contents generate in post action of open tag. result += addedContents; // Save the result to data buffer. saveHTMLContentToBuffer(result, param); } // Serialize end tag of an specified element. void WebPageSerializerImpl::endTagToString(Element* element, SerializeDomParam* param) { bool needSkip; // Do pre action for end tag. String result = preActionBeforeSerializeEndTag(element, param, &needSkip); if (needSkip) return; // Write end tag when element has child/children. if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) { result += "</"; result += element->nodeName().lower(); result += ">"; } else { // Check whether we have to write end tag for empty element. if (param->isHTMLDocument) { result += ">"; // FIXME: This code is horribly wrong. WebPageSerializerImpl must die. 
if (!static_cast<const HTMLElement*>(element)->ieForbidsInsertHTML()) { // We need to write end tag when it is required. result += "</"; result += element->nodeName().lower(); result += ">"; } } else { // For xml base document. result += " />"; } } // Do post action for end tag. result += postActionAfterSerializeEndTag(element, param); // Save the result to data buffer. saveHTMLContentToBuffer(result, param); } void WebPageSerializerImpl::buildContentForNode(Node* node, SerializeDomParam* param) { switch (node->nodeType()) { case Node::ELEMENT_NODE: // Process open tag of element. openTagToString(static_cast<Element*>(node), param); // Walk through the children nodes and process it. for (Node *child = node->firstChild(); child; child = child->nextSibling()) buildContentForNode(child, param); // Process end tag of element. endTagToString(static_cast<Element*>(node), param); break; case Node::TEXT_NODE: saveHTMLContentToBuffer(createMarkup(node), param); break; case Node::ATTRIBUTE_NODE: case Node::DOCUMENT_NODE: case Node::DOCUMENT_FRAGMENT_NODE: // Should not exist. ASSERT_NOT_REACHED(); break; // Document type node can be in DOM? case Node::DOCUMENT_TYPE_NODE: param->haveSeenDocType = true; default: // For other type node, call default action. saveHTMLContentToBuffer(createMarkup(node), param); break; } } WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, bool recursiveSerialization, WebPageSerializerClient* client, const WebVector<WebURL>& links, const WebVector<WebString>& localPaths, const WebString& localDirectoryName) : m_client(client) , m_recursiveSerialization(recursiveSerialization) , m_framesCollected(false) , m_localDirectoryName(localDirectoryName) , m_htmlEntities(false) , m_xmlEntities(true) { // Must specify available webframe. ASSERT(frame); m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame); // Make sure we have non 0 client. ASSERT(client); // Build local resources map. 
ASSERT(links.size() == localPaths.size()); for (size_t i = 0; i < links.size(); i++) { KURL url = links[i]; ASSERT(!m_localLinks.contains(url.string())); m_localLinks.set(url.string(), localPaths[i]); } ASSERT(m_dataBuffer.isEmpty()); } void WebPageSerializerImpl::collectTargetFrames() { ASSERT(!m_framesCollected); m_framesCollected = true; // First, process main frame. m_frames.append(m_specifiedWebFrameImpl); // Return now if user only needs to serialize specified frame, not including // all sub-frames. if (!m_recursiveSerialization) return; // Collect all frames inside the specified frame. for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) { WebFrameImpl* currentFrame = m_frames[i]; // Get current using document. Document* currentDoc = currentFrame->frame()->document(); // Go through sub-frames. RefPtr<HTMLAllCollection> all = currentDoc->all(); for (Node* node = all->firstItem(); node; node = all->nextItem()) { if (!node->isHTMLElement()) continue; Element* element = static_cast<Element*>(node); WebFrameImpl* webFrame = WebFrameImpl::fromFrameOwnerElement(element); if (webFrame) m_frames.append(webFrame); } } } bool WebPageSerializerImpl::serialize() { if (!m_framesCollected) collectTargetFrames(); bool didSerialization = false; KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url(); for (unsigned i = 0; i < m_frames.size(); ++i) { WebFrameImpl* webFrame = m_frames[i]; Document* document = webFrame->frame()->document(); const KURL& url = document->url(); if (!url.isValid() || !m_localLinks.contains(url.string())) continue; didSerialization = true; String encoding = document->loader()->writer()->encoding(); const TextEncoding& textEncoding = encoding.isEmpty() ? UTF8Encoding() : TextEncoding(encoding); String directoryName = url == mainURL ? 
m_localDirectoryName : ""; SerializeDomParam param(url, textEncoding, document, directoryName); Element* documentElement = document->documentElement(); if (documentElement) buildContentForNode(documentElement, ¶m); encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, ForceFlush); } ASSERT(m_dataBuffer.isEmpty()); m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished); return didSerialization; } } // namespace WebKit