// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/browser/search_engines/template_url_parser.h" #include <algorithm> #include <map> #include <vector> #include "base/logging.h" #include "base/memory/scoped_ptr.h" #include "base/string_number_conversions.h" #include "base/string_util.h" #include "base/utf_string_conversions.h" #include "chrome/browser/search_engines/template_url.h" #include "chrome/common/url_constants.h" #include "googleurl/src/gurl.h" #include "libxml/parser.h" #include "libxml/xmlwriter.h" namespace { // // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds // to that of char, the following names are all in terms of char. This avoids // having to convert to wide, then do comparisons // Defines for element names of the OSD document: static const char kURLElement[] = "Url"; static const char kParamElement[] = "Param"; static const char kShortNameElement[] = "ShortName"; static const char kDescriptionElement[] = "Description"; static const char kImageElement[] = "Image"; static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; static const char kLanguageElement[] = "Language"; static const char kInputEncodingElement[] = "InputEncoding"; // Various XML attributes used. static const char kURLTypeAttribute[] = "type"; static const char kURLTemplateAttribute[] = "template"; static const char kImageTypeAttribute[] = "type"; static const char kImageWidthAttribute[] = "width"; static const char kImageHeightAttribute[] = "height"; static const char kURLIndexOffsetAttribute[] = "indexOffset"; static const char kURLPageOffsetAttribute[] = "pageOffset"; static const char kParamNameAttribute[] = "name"; static const char kParamValueAttribute[] = "value"; static const char kParamMethodAttribute[] = "method"; // Mime type for search results. static const char kHTMLType[] = "text/html"; // Mime type for as you type suggestions. static const char kSuggestionType[] = "application/x-suggestions+json"; // Namespace identifier. static const char kOSDNS[] = "xmlns"; // The namespace for documents we understand. static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/"; // Removes the namespace from the specified |name|, ex: os:Url -> Url. static void PruneNamespace(std::string* name) { size_t index = name->find_first_of(":"); if (index != std::string::npos) name->erase(0, index + 1); } // // To minimize memory overhead while parsing, a SAX style parser is used. // ParsingContext is used to maintain the state we're in the document // while parsing. class ParsingContext { public: // Enum of the known element types. enum ElementType { UNKNOWN, OPEN_SEARCH_DESCRIPTION, URL, PARAM, SHORT_NAME, DESCRIPTION, IMAGE, LANGUAGE, INPUT_ENCODING, }; enum Method { GET, POST }; // Key/value of a Param node. typedef std::pair<std::string, std::string> Param; ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter, TemplateURL* url) : url_(url), parameter_filter_(parameter_filter), method_(GET), suggestion_method_(GET), is_suggest_url_(false), derive_image_from_url_(false) { if (kElementNameToElementTypeMap == NULL) InitMapping(); } // Invoked when an element starts. void PushElement(const std::string& element) { ElementType type; if (kElementNameToElementTypeMap->find(element) == kElementNameToElementTypeMap->end()) { type = UNKNOWN; } else { type = (*kElementNameToElementTypeMap)[element]; } elements_.push_back(type); } void PopElement() { elements_.pop_back(); } // Returns the current ElementType. ElementType GetKnownType() { if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) return elements_[1]; // We only expect PARAM nodes under the Url node if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && elements_[1] == URL && elements_[2] == PARAM) return PARAM; return UNKNOWN; } TemplateURL* template_url() { return url_; } void AddImageRef(const std::string& type, int width, int height) { if (width > 0 && height > 0) current_image_.reset(new TemplateURL::ImageRef(type, width, height)); } void EndImage() { current_image_.reset(); } void SetImageURL(const GURL& url) { if (current_image_.get()) { current_image_->url = url; url_->add_image_ref(*current_image_); current_image_.reset(); } } void ResetString() { string_.clear(); } void AppendString(const string16& string) { string_ += string; } const string16& GetString() { return string_; } void ResetExtraParams() { extra_params_.clear(); } void AddExtraParams(const std::string& key, const std::string& value) { if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value)) return; extra_params_.push_back(Param(key, value)); } const std::vector<Param>& extra_params() const { return extra_params_; } void set_is_suggestion(bool value) { is_suggest_url_ = value; } bool is_suggestion() const { return is_suggest_url_; } TemplateURLParser::ParameterFilter* parameter_filter() const { return parameter_filter_; } void set_derive_image_from_url(bool derive_image_from_url) { derive_image_from_url_ = derive_image_from_url; } void set_method(Method method) { method_ = method; } Method method() { return method_; } void set_suggestion_method(Method method) { suggestion_method_ = method; } Method suggestion_method() { return suggestion_method_; } // Builds the image URL from the Template search URL if no image URL has been // set. void DeriveImageFromURL() { if (derive_image_from_url_ && url_->GetFaviconURL().is_empty() && url_->url()) { GURL url(url_->url()->url()); // More url's please... url_->SetFaviconURL(TemplateURL::GenerateFaviconURL(url)); } } private: static void InitMapping() { kElementNameToElementTypeMap = new std::map<std::string, ElementType>; (*kElementNameToElementTypeMap)[kURLElement] = URL; (*kElementNameToElementTypeMap)[kParamElement] = PARAM; (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION; (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION; (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION; (*kElementNameToElementTypeMap)[kLanguageElement] = LANGUAGE; (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING; } // Key is UTF8 encoded. static std::map<std::string, ElementType>* kElementNameToElementTypeMap; // TemplateURL supplied to Read method. It's owned by the caller, so we // don't need to free it. TemplateURL* url_; std::vector<ElementType> elements_; scoped_ptr<TemplateURL::ImageRef> current_image_; // Character content for the current element. string16 string_; TemplateURLParser::ParameterFilter* parameter_filter_; // The list of parameters parsed in the Param nodes of a Url node. std::vector<Param> extra_params_; // The HTTP methods used. Method method_; Method suggestion_method_; // If true, we are currently parsing a suggest URL, otherwise it is an HTML // search. Note that we don't need a stack as Url nodes cannot be nested. bool is_suggest_url_; // Whether we should derive the image from the URL (when images are data // URLs). bool derive_image_from_url_; DISALLOW_COPY_AND_ASSIGN(ParsingContext); }; // static std::map<std::string, ParsingContext::ElementType>* ParsingContext::kElementNameToElementTypeMap = NULL; string16 XMLCharToUTF16(const xmlChar* value, int length) { return UTF8ToUTF16(std::string((const char*)value, length)); } std::string XMLCharToString(const xmlChar* value) { return std::string((const char*)value); } // Returns true if input_encoding contains a valid input encoding string. This // doesn't verify that we have a valid encoding for the string, just that the // string contains characters that constitute a valid input encoding. bool IsValidEncodingString(const std::string& input_encoding) { if (input_encoding.empty()) return false; if (!IsAsciiAlpha(input_encoding[0])) return false; for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { char c = input_encoding[i]; if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' && c != '-') { return false; } } return true; } void ParseURL(const xmlChar** atts, ParsingContext* context) { if (!atts) return; TemplateURL* turl = context->template_url(); const xmlChar** attributes = atts; std::string template_url; bool is_post = false; bool is_html_url = false; bool is_suggest_url = false; int index_offset = 1; int page_offset = 1; while (*attributes) { std::string name(XMLCharToString(*attributes)); const xmlChar* value = attributes[1]; if (name == kURLTypeAttribute) { std::string type = XMLCharToString(value); is_html_url = (type == kHTMLType); is_suggest_url = (type == kSuggestionType); } else if (name == kURLTemplateAttribute) { template_url = XMLCharToString(value); } else if (name == kURLIndexOffsetAttribute) { base::StringToInt(XMLCharToString(value), &index_offset); index_offset = std::max(1, index_offset); } else if (name == kURLPageOffsetAttribute) { base::StringToInt(XMLCharToString(value), &page_offset); page_offset = std::max(1, page_offset); } else if (name == kParamMethodAttribute) { is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); } attributes += 2; } if (is_html_url) { turl->SetURL(template_url, index_offset, page_offset); context->set_is_suggestion(false); if (is_post) context->set_method(ParsingContext::POST); } else if (is_suggest_url) { turl->SetSuggestionsURL(template_url, index_offset, page_offset); context->set_is_suggestion(true); if (is_post) context->set_suggestion_method(ParsingContext::POST); } } void ParseImage(const xmlChar** atts, ParsingContext* context) { if (!atts) return; const xmlChar** attributes = atts; int width = 0; int height = 0; std::string type; while (*attributes) { std::string name(XMLCharToString(*attributes)); const xmlChar* value = attributes[1]; if (name == kImageTypeAttribute) { type = XMLCharToString(value); } else if (name == kImageWidthAttribute) { base::StringToInt(XMLCharToString(value), &width); } else if (name == kImageHeightAttribute) { base::StringToInt(XMLCharToString(value), &height); } attributes += 2; } if (width > 0 && height > 0 && !type.empty()) { // Valid Image URL. context->AddImageRef(type, width, height); } } void ParseParam(const xmlChar** atts, ParsingContext* context) { if (!atts) return; const xmlChar** attributes = atts; std::string key, value; while (*attributes) { std::string name(XMLCharToString(*attributes)); const xmlChar* val = attributes[1]; if (name == kParamNameAttribute) { key = XMLCharToString(val); } else if (name == kParamValueAttribute) { value = XMLCharToString(val); } attributes += 2; } if (!key.empty()) context->AddExtraParams(key, value); } static void AppendParamToQuery(const std::string& key, const std::string& value, std::string* query) { if (!query->empty()) query->append("&"); if (!key.empty()) { query->append(key); query->append("="); } query->append(value); } void ProcessURLParams(ParsingContext* context) { TemplateURL* t_url = context->template_url(); const TemplateURLRef* t_url_ref = context->is_suggestion() ? t_url->suggestions_url() : t_url->url(); if (!t_url_ref) return; if (!context->parameter_filter() && context->extra_params().empty()) return; GURL url(t_url_ref->url()); // If there is a parameter filter, parse the existing URL and remove any // unwanted parameter. TemplateURLParser::ParameterFilter* filter = context->parameter_filter(); std::string new_query; bool modified = false; if (filter) { url_parse::Component query = url.parsed_for_possibly_invalid_spec().query; url_parse::Component key, value; const char* url_spec = url.spec().c_str(); while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { std::string key_str(url_spec, key.begin, key.len); std::string value_str(url_spec, value.begin, value.len); if (filter->KeepParameter(key_str, value_str)) { AppendParamToQuery(key_str, value_str, &new_query); } else { modified = true; } } } if (!modified) new_query = url.query(); // Add the extra parameters if any. const std::vector<ParsingContext::Param>& params = context->extra_params(); if (!params.empty()) { modified = true; std::vector<ParsingContext::Param>::const_iterator iter; for (iter = params.begin(); iter != params.end(); ++iter) AppendParamToQuery(iter->first, iter->second, &new_query); } if (modified) { GURL::Replacements repl; repl.SetQueryStr(new_query); url = url.ReplaceComponents(repl); if (context->is_suggestion()) { t_url->SetSuggestionsURL(url.spec(), t_url_ref->index_offset(), t_url_ref->page_offset()); } else { t_url->SetURL(url.spec(), t_url_ref->index_offset(), t_url_ref->page_offset()); } } } void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) { ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); std::string node_name((const char*)name); PruneNamespace(&node_name); context->PushElement(node_name); switch (context->GetKnownType()) { case ParsingContext::URL: context->ResetExtraParams(); ParseURL(atts, context); break; case ParsingContext::IMAGE: ParseImage(atts, context); break; case ParsingContext::PARAM: ParseParam(atts, context); break; default: break; } context->ResetString(); } void EndElementImpl(void *ctx, const xmlChar *name) { ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); switch (context->GetKnownType()) { case ParsingContext::SHORT_NAME: context->template_url()->set_short_name(context->GetString()); break; case ParsingContext::DESCRIPTION: context->template_url()->set_description(context->GetString()); break; case ParsingContext::IMAGE: { GURL image_url(UTF16ToUTF8(context->GetString())); if (image_url.SchemeIs(chrome::kDataScheme)) { // TODO (jcampan): bug 1169256: when dealing with data URL, we need to // decode the data URL in the renderer. For now, we'll just point to the // favicon from the URL. context->set_derive_image_from_url(true); } else { context->SetImageURL(image_url); } context->EndImage(); break; } case ParsingContext::LANGUAGE: context->template_url()->add_language(context->GetString()); break; case ParsingContext::INPUT_ENCODING: { std::string input_encoding = UTF16ToASCII(context->GetString()); if (IsValidEncodingString(input_encoding)) context->template_url()->add_input_encoding(input_encoding); break; } case ParsingContext::URL: ProcessURLParams(context); break; default: break; } context->ResetString(); context->PopElement(); } void CharactersImpl(void *ctx, const xmlChar *ch, int len) { ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); context->AppendString(XMLCharToUTF16(ch, len)); } // Returns true if the ref is null, or the url wrapped by ref is // valid with a spec of http/https. bool IsHTTPRef(const TemplateURLRef* ref) { if (ref == NULL) return true; GURL url(ref->url()); return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) || url.SchemeIs(chrome::kHttpsScheme))); } // Returns true if the TemplateURL is legal. A legal TemplateURL is one // where all URLs have a spec of http/https. bool IsLegal(TemplateURL* url) { if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url())) return false; // Make sure all the image refs are legal. const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs(); for (size_t i = 0; i < image_refs.size(); i++) { GURL image_url(image_refs[i].url); if (!image_url.is_valid() || !(image_url.SchemeIs(chrome::kHttpScheme) || image_url.SchemeIs(chrome::kHttpsScheme))) { return false; } } return true; } } // namespace // static bool TemplateURLParser::Parse(const unsigned char* data, size_t length, TemplateURLParser::ParameterFilter* param_filter, TemplateURL* url) { DCHECK(url); // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to // & . Unfortunately xmlSubstituteEntitiesDefault effects global state. // If this becomes problematic we'll need to provide our own entity // type for &, or strip out " by hand after parsing. int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); ParsingContext context(param_filter, url); xmlSAXHandler sax_handler; memset(&sax_handler, 0, sizeof(sax_handler)); sax_handler.startElement = &StartElementImpl; sax_handler.endElement = &EndElementImpl; sax_handler.characters = &CharactersImpl; xmlSAXUserParseMemory(&sax_handler, &context, reinterpret_cast<const char*>(data), static_cast<int>(length)); xmlSubstituteEntitiesDefault(last_sub_entities_value); // If the image was a data URL, use the favicon from the search URL instead. // (see TODO inEndElementImpl()). context.DeriveImageFromURL(); // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines // that use POST yet. if (context.method() == ParsingContext::POST) return false; if (context.suggestion_method() == ParsingContext::POST) url->SetSuggestionsURL("", 0, 0); if (!url->short_name().empty() && !url->description().empty()) { // So far so good, make sure the urls are http. return IsLegal(url); } return false; }