url_matcher.cc - Android社区 - https://www.androidos.net.cn/

// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_matcher/url_matcher.h"

#include <algorithm>
#include <iterator>

#include "base/logging.h"
#include "url/gurl.h"
#include "url/url_canon.h"

namespace url_matcher {

// This set of classes implement a mapping of URL Component Patterns, such as
// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
// for use in substring comparisons.
//
// The idea of this mapping is to reduce the problem of comparing many
// URL Component Patterns against one URL to the problem of searching many
// substrings in one string:
//
// ----------------------                    -----------------
// | URL Query operator | ----translate----> | StringPattern |
// ----------------------                    -----------------
//                                                   ^
//                                                   |
//                                                compare
//                                                   |
//                                                   v
// ----------------------                    -----------------
// | URL to compare     |                    |               |
// | to all URL Query   | ----translate----> | String        |
// | operators          |                    |               |
// ----------------------                    -----------------
//
// The reason for this problem reduction is that there are efficient algorithms
// for searching many substrings in one string (see Aho-Corasick algorithm).
//
// Additionally, some of the same pieces are reused to implement regular
// expression comparisons. The FilteredRE2 implementation for matching many
// regular expressions against one string uses prefiltering, in which a set
// of substrings (derived from the regexes) are first searched for, to reduce
// the number of regular expressions to test; the prefiltering step also
// uses Aho-Corasick.
//
// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
// ==========================================================
//
// For searches in this class, we normalize URLs as follows:
//
// Step 1:
// Remove scheme, port and segment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
//
// We remove the scheme and port number because they can be checked later
// in a secondary filter step. We remove the segment (the #... part) because
// this is not guaranteed to be ASCII-7 encoded.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - ED = End of Domain
// - EP = End of Path
// - EU = End of URL
// Furthermore, the hostname is canonicalized to start with a ".".
//
// Position markers are represented as characters >127, which are therefore
// guaranteed not to be part of the ASCII-7 encoded URL character set.
//
// -> www.example.com/index.html?search=foo becomes
// BU .www.example.com ED /index.html EP ?search=foo EU
//
// -> www.example.com/index.html becomes
// BU .www.example.com ED /index.html EP EU
//
// Step 3:
// Translate URL Component Patterns as follows:
//
// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
// -> host_prefix("www.example") = BU .www.example
//
// host_suffix(suffix) = suffix ED
// -> host_suffix("example.com") = example.com ED
// -> host_suffix(".example.com") = .example.com ED
//
// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
// -> host_equals("www.example.com") = BU .www.example.com ED
//
// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
//
// With this, we can search the StringPatterns in the normalized URL.
//
//
// Case 2: url_{prefix,suffix,equals,contains} searches.
// =====================================================
//
// Step 1: as above, except that
// - the scheme is not removed
// - the port is not removed if it is specified and does not match the default
//   port for the given scheme.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - EU = End of URL
//
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
// BU http://www.example.com:8080/index.html?search=foo EU
// -> http://www.example.com:80/index.html?search=foo#first_match becomes
// BU http://www.example.com/index.html?search=foo EU
//
// url_prefix(prefix) = BU prefix
// -> url_prefix("http://www.example") = BU http://www.example
//
// url_contains(substring) = substring
// -> url_contains("index") = index
//
//
// Case 3: {host,path,query}_contains searches.
// ============================================
//
// These kinds of searches are not supported directly but can be derived
// by a combination of a url_contains() query followed by an explicit test:
//
// host_contains(str) = url_contains(str) followed by test whether str occurs
//   in host component of original URL.
// -> host_contains("example.co") = example.co
//    followed by gurl.host().find("example.co");
//
// [similarly for path_contains and query_contains].
//
//
// Regular expression matching (url_matches searches)
// ==================================================
//
// This class also supports matching regular expressions (RE2 syntax)
// against full URLs, which are transformed as in case 2.

namespace {

bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
  return criterion == URLMatcherCondition::URL_MATCHES;
}

bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
  return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
}

}  // namespace

//
// URLMatcherCondition
//

URLMatcherCondition::URLMatcherCondition()
    : criterion_(HOST_PREFIX),
      string_pattern_(NULL) {}

URLMatcherCondition::~URLMatcherCondition() {}

URLMatcherCondition::URLMatcherCondition(
    Criterion criterion,
    const StringPattern* string_pattern)
    : criterion_(criterion),
      string_pattern_(string_pattern) {}

URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    : criterion_(rhs.criterion_),
      string_pattern_(rhs.string_pattern_) {}

URLMatcherCondition& URLMatcherCondition::operator=(
    const URLMatcherCondition& rhs) {
  criterion_ = rhs.criterion_;
  string_pattern_ = rhs.string_pattern_;
  return *this;
}

bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
  if (criterion_ < rhs.criterion_) return true;
  if (criterion_ > rhs.criterion_) return false;
  if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    return *string_pattern_ < *rhs.string_pattern_;
  if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
  // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
  // or both are NULL.
  return false;
}

bool URLMatcherCondition::IsFullURLCondition() const {
  // For these criteria the SubstringMatcher needs to be executed on the
  // GURL that is canonicalized with
  // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
  switch (criterion_) {
    case HOST_CONTAINS:
    case PATH_CONTAINS:
    case QUERY_CONTAINS:
    case URL_PREFIX:
    case URL_SUFFIX:
    case URL_CONTAINS:
    case URL_EQUALS:
      return true;
    default:
      break;
  }
  return false;
}

bool URLMatcherCondition::IsRegexCondition() const {
  return IsRegexCriterion(criterion_);
}

bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
  return IsOriginAndPathRegexCriterion(criterion_);
}

bool URLMatcherCondition::IsMatch(
    const std::set<StringPattern::ID>& matching_patterns,
    const GURL& url) const {
  DCHECK(string_pattern_);
  if (!ContainsKey(matching_patterns, string_pattern_->id()))
    return false;
  // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
  // a substring match on the raw URL. In case of a match, we need to verify
  // that the match was found in the correct component of the URL.
  switch (criterion_) {
    case HOST_CONTAINS:
      return url.host().find(string_pattern_->pattern()) !=
          std::string::npos;
    case PATH_CONTAINS:
      return url.path().find(string_pattern_->pattern()) !=
          std::string::npos;
    case QUERY_CONTAINS:
      return url.query().find(string_pattern_->pattern()) !=
          std::string::npos;
    default:
      break;
  }
  return true;
}

//
// URLMatcherConditionFactory
//

namespace {
// These are symbols that are not contained in 7-bit ASCII used in GURLs.
const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
const char kEndOfDomain[] = {static_cast<char>(-2), 0};
const char kEndOfPath[] = {static_cast<char>(-3), 0};
const char kEndOfURL[] = {static_cast<char>(-4), 0};
}  // namespace

URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}

URLMatcherConditionFactory::~URLMatcherConditionFactory() {
  STLDeleteElements(&substring_pattern_singletons_);
  STLDeleteElements(®ex_pattern_singletons_);
  STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
}

std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    const GURL& url) const {
  return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
         url.path() + kEndOfPath +
         (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::HOST_PREFIX,
      kBeginningOfURL + CanonicalizeHostname(prefix));
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
      suffix + kEndOfDomain);
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::HOST_EQUALS,
      kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::PATH_PREFIX,
      kEndOfDomain + prefix);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::PATH_EQUALS,
      kEndOfDomain + str + kEndOfPath);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    const std::string& prefix) {
  std::string pattern;
  if (!prefix.empty() && prefix[0] == '?')
    pattern = kEndOfPath + prefix;
  else
    pattern = kEndOfPath + ('?' + prefix);

return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    const std::string& suffix) {
  if (!suffix.empty() && suffix[0] == '?') {
    return CreateQueryEqualsCondition(suffix);
  } else {
    return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
                           suffix + kEndOfURL);
  }
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    const std::string& str) {
  if (!str.empty() && str[0] == '?')
    return CreateQueryPrefixCondition(str);
  else
    return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    const std::string& str) {
  std::string pattern;
  if (!str.empty() && str[0] == '?')
    pattern = kEndOfPath + str + kEndOfURL;
  else
    pattern = kEndOfPath + ('?' + str) + kEndOfURL;

return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
}

URLMatcherCondition
    URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    const std::string& host_suffix,
    const std::string& path_prefix) {
  return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
      host_suffix + kEndOfDomain + path_prefix);
}

URLMatcherCondition
URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    const std::string& host,
    const std::string& path_prefix) {
  return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
      kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
      path_prefix);
}

std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    const GURL& url) const {
  GURL::Replacements replacements;
  replacements.ClearPassword();
  replacements.ClearUsername();
  replacements.ClearRef();
  // Clear port if it is implicit from scheme.
  if (url.has_port()) {
    const std::string& port = url.scheme();
    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
            url.EffectiveIntPort()) {
      replacements.ClearPort();
    }
  }
  return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
      kEndOfURL;
}

static std::string CanonicalizeURLForRegexSearchesHelper(
    const GURL& url,
    bool clear_query) {
  GURL::Replacements replacements;
  replacements.ClearPassword();
  replacements.ClearUsername();
  replacements.ClearRef();
  if (clear_query)
    replacements.ClearQuery();
  // Clear port if it is implicit from scheme.
  if (url.has_port()) {
    const std::string& port = url.scheme();
    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
            url.EffectiveIntPort()) {
      replacements.ClearPort();
    }
  }
  return url.ReplaceComponents(replacements).spec();
}

std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    const GURL& url) const {
  return CanonicalizeURLForRegexSearchesHelper(url, false);
}

std::string
URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    const GURL& url) const {
  return CanonicalizeURLForRegexSearchesHelper(url, true);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::URL_PREFIX,
      kBeginningOfURL + prefix);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::URL_EQUALS,
      kBeginningOfURL + str + kEndOfURL);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    const std::string& regex) {
  return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
}

URLMatcherCondition
URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    const std::string& regex) {
  return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
}

void URLMatcherConditionFactory::ForgetUnusedPatterns(
      const std::set<StringPattern::ID>& used_patterns) {
  PatternSingletons::iterator i = substring_pattern_singletons_.begin();
  while (i != substring_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      substring_pattern_singletons_.erase(i++);
    }
  }
  i = regex_pattern_singletons_.begin();
  while (i != regex_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      regex_pattern_singletons_.erase(i++);
    }
  }
  i = origin_and_path_regex_pattern_singletons_.begin();
  while (i != origin_and_path_regex_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      origin_and_path_regex_pattern_singletons_.erase(i++);
    }
  }
}

bool URLMatcherConditionFactory::IsEmpty() const {
  return substring_pattern_singletons_.empty() &&
      regex_pattern_singletons_.empty() &&
      origin_and_path_regex_pattern_singletons_.empty();
}

URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    URLMatcherCondition::Criterion criterion,
    const std::string& pattern) {
  StringPattern search_pattern(pattern, 0);
  PatternSingletons* pattern_singletons = NULL;
  if (IsRegexCriterion(criterion))
    pattern_singletons = ®ex_pattern_singletons_;
  else if (IsOriginAndPathRegexCriterion(criterion))
    pattern_singletons = &origin_and_path_regex_pattern_singletons_;
  else
    pattern_singletons = &substring_pattern_singletons_;

PatternSingletons::const_iterator iter =
      pattern_singletons->find(&search_pattern);

if (iter != pattern_singletons->end()) {
    return URLMatcherCondition(criterion, *iter);
  } else {
    StringPattern* new_pattern =
        new StringPattern(pattern, id_counter_++);
    pattern_singletons->insert(new_pattern);
    return URLMatcherCondition(criterion, new_pattern);
  }
}

std::string URLMatcherConditionFactory::CanonicalizeHostname(
    const std::string& hostname) const {
  if (!hostname.empty() && hostname[0] == '.')
    return hostname;
  else
    return "." + hostname;
}

bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    StringPattern* lhs,
    StringPattern* rhs) const {
  if (lhs == NULL && rhs != NULL) return true;
  if (lhs != NULL && rhs != NULL)
    return lhs->pattern() < rhs->pattern();
  // Either both are NULL or only rhs is NULL.
  return false;
}

//
// URLMatcherSchemeFilter
//

URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    : filters_(1) {
  filters_.push_back(filter);
}

URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    const std::vector<std::string>& filters)
    : filters_(filters) {}

URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}

bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
  return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
      filters_.end();
}

//
// URLMatcherPortFilter
//

URLMatcherPortFilter::URLMatcherPortFilter(
    const std::vector<URLMatcherPortFilter::Range>& ranges)
    : ranges_(ranges) {}

URLMatcherPortFilter::~URLMatcherPortFilter() {}

bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
  int port = url.EffectiveIntPort();
  for (std::vector<Range>::const_iterator i = ranges_.begin();
       i != ranges_.end(); ++i) {
    if (i->first <= port && port <= i->second)
      return true;
  }
  return false;
}

// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
                                                              int to) {
  return Range(from, to);
}

// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
  return Range(port, port);
}

//
// URLMatcherConditionSet
//

URLMatcherConditionSet::~URLMatcherConditionSet() {}

URLMatcherConditionSet::URLMatcherConditionSet(
    ID id,
    const Conditions& conditions)
    : id_(id),
      conditions_(conditions) {}

URLMatcherConditionSet::URLMatcherConditionSet(
    ID id,
    const Conditions& conditions,
    scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    scoped_ptr<URLMatcherPortFilter> port_filter)
    : id_(id),
      conditions_(conditions),
      scheme_filter_(scheme_filter.Pass()),
      port_filter_(port_filter.Pass()) {}

bool URLMatcherConditionSet::IsMatch(
    const std::set<StringPattern::ID>& matching_patterns,
    const GURL& url) const {
  for (Conditions::const_iterator i = conditions_.begin();
       i != conditions_.end(); ++i) {
    if (!i->IsMatch(matching_patterns, url))
      return false;
  }
  if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    return false;
  if (port_filter_.get() && !port_filter_->IsMatch(url))
    return false;
  return true;
}

//
// URLMatcher
//

URLMatcher::URLMatcher() {}

URLMatcher::~URLMatcher() {}

void URLMatcher::AddConditionSets(
    const URLMatcherConditionSet::Vector& condition_sets) {
  for (URLMatcherConditionSet::Vector::const_iterator i =
       condition_sets.begin(); i != condition_sets.end(); ++i) {
    DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
        url_matcher_condition_sets_.end());
    url_matcher_condition_sets_[(*i)->id()] = *i;
  }
  UpdateInternalDatastructures();
}

void URLMatcher::RemoveConditionSets(
    const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
  for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
       condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    DCHECK(url_matcher_condition_sets_.find(*i) !=
        url_matcher_condition_sets_.end());
    url_matcher_condition_sets_.erase(*i);
  }
  UpdateInternalDatastructures();
}

void URLMatcher::ClearUnusedConditionSets() {
  UpdateConditionFactory();
}

std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    const GURL& url) const {
  // Find all IDs of StringPatterns that match |url|.
  // See URLMatcherConditionFactory for the canonicalization of URLs and the
  // distinction between full url searches and url component searches.
  std::set<StringPattern::ID> matches;
  if (!full_url_matcher_.IsEmpty()) {
    full_url_matcher_.Match(
        condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
  }
  if (!url_component_matcher_.IsEmpty()) {
    url_component_matcher_.Match(
        condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
  }
  if (!regex_set_matcher_.IsEmpty()) {
    regex_set_matcher_.Match(
        condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
  }
  if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    origin_and_path_regex_set_matcher_.Match(
        condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
        &matches);
  }

// Calculate all URLMatcherConditionSets for which all URLMatcherConditions
  // were fulfilled.
  std::set<URLMatcherConditionSet::ID> result;
  for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
       i != matches.end(); ++i) {
    // For each URLMatcherConditionSet there is exactly one condition
    // registered in substring_match_triggers_. This means that the following
    // logic tests each URLMatcherConditionSet exactly once if it can be
    // completely fulfilled.
    StringPatternTriggers::const_iterator triggered_condition_sets_iter =
        substring_match_triggers_.find(*i);
    if (triggered_condition_sets_iter == substring_match_triggers_.end())
      continue;  // Not all substring matches are triggers for a condition set.
    const std::set<URLMatcherConditionSet::ID>& condition_sets =
        triggered_condition_sets_iter->second;
    for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
         condition_sets.begin(); j != condition_sets.end(); ++j) {
      URLMatcherConditionSets::const_iterator condition_set_iter =
          url_matcher_condition_sets_.find(*j);
      DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
      if (condition_set_iter->second->IsMatch(matches, url))
        result.insert(*j);
    }
  }

return result;
}

bool URLMatcher::IsEmpty() const {
  return condition_factory_.IsEmpty() &&
      url_matcher_condition_sets_.empty() &&
      substring_match_triggers_.empty() &&
      full_url_matcher_.IsEmpty() &&
      url_component_matcher_.IsEmpty() &&
      regex_set_matcher_.IsEmpty() &&
      origin_and_path_regex_set_matcher_.IsEmpty() &&
      registered_full_url_patterns_.empty() &&
      registered_url_component_patterns_.empty();
}

void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
  // The purpose of |full_url_conditions| is just that we need to execute
  // the same logic once for Full URL searches and once for URL Component
  // searches (see URLMatcherConditionFactory).

// Determine which patterns need to be registered when this function
  // terminates.
  std::set<const StringPattern*> new_patterns;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      // If we are called to process Full URL searches, ignore others, and
      // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
      if (!condition_iter->IsRegexCondition() &&
          !condition_iter->IsOriginAndPathRegexCondition() &&
          full_url_conditions == condition_iter->IsFullURLCondition())
        new_patterns.insert(condition_iter->string_pattern());
    }
  }

// This is the set of patterns that were registered before this function
  // is called.
  std::set<const StringPattern*>& registered_patterns =
      full_url_conditions ? registered_full_url_patterns_
                          : registered_url_component_patterns_;

// Add all patterns that are in new_patterns but not in registered_patterns.
  std::vector<const StringPattern*> patterns_to_register =
      base::STLSetDifference<std::vector<const StringPattern*> >(
          new_patterns, registered_patterns);

// Remove all patterns that are in registered_patterns but not in
  // new_patterns.
  std::vector<const StringPattern*> patterns_to_unregister =
      base::STLSetDifference<std::vector<const StringPattern*> >(
           registered_patterns, new_patterns);

// Update the SubstringSetMatcher.
  SubstringSetMatcher& url_matcher =
      full_url_conditions ? full_url_matcher_ : url_component_matcher_;
  url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
                                            patterns_to_unregister);

// Update the set of registered_patterns for the next time this function
  // is being called.
  registered_patterns.swap(new_patterns);
}

void URLMatcher::UpdateRegexSetMatcher() {
  std::vector<const StringPattern*> new_patterns;
  std::vector<const StringPattern*> new_origin_and_path_patterns;

for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      if (condition_iter->IsRegexCondition()) {
        new_patterns.push_back(condition_iter->string_pattern());
      } else if (condition_iter->IsOriginAndPathRegexCondition()) {
        new_origin_and_path_patterns.push_back(
            condition_iter->string_pattern());
      }
    }
  }

// Start over from scratch. We can't really do better than this, since the
  // FilteredRE2 backend doesn't support incremental updates.
  regex_set_matcher_.ClearPatterns();
  regex_set_matcher_.AddPatterns(new_patterns);
  origin_and_path_regex_set_matcher_.ClearPatterns();
  origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
}

void URLMatcher::UpdateTriggers() {
  // Count substring pattern frequencies.
  std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      const StringPattern* pattern = condition_iter->string_pattern();
      substring_pattern_frequencies[pattern->id()]++;
    }
  }

// Update trigger conditions: Determine for each URLMatcherConditionSet which
  // URLMatcherCondition contains a StringPattern that occurs least
  // frequently in this URLMatcher. We assume that this condition is very
  // specific and occurs rarely in URLs. If a match occurs for this
  // URLMatcherCondition, we want to test all other URLMatcherCondition in the
  // respective URLMatcherConditionSet as well to see whether the entire
  // URLMatcherConditionSet is considered matching.
  substring_match_triggers_.clear();
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    if (conditions.empty())
      continue;
    URLMatcherConditionSet::Conditions::const_iterator condition_iter =
        conditions.begin();
    StringPattern::ID trigger = condition_iter->string_pattern()->id();
    // We skip the first element in the following loop.
    ++condition_iter;
    for (; condition_iter != conditions.end(); ++condition_iter) {
      StringPattern::ID current_id =
          condition_iter->string_pattern()->id();
      if (substring_pattern_frequencies[trigger] >
          substring_pattern_frequencies[current_id]) {
        trigger = current_id;
      }
    }
    substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
  }
}

void URLMatcher::UpdateConditionFactory() {
  std::set<StringPattern::ID> used_patterns;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      used_patterns.insert(condition_iter->string_pattern()->id());
    }
  }
  condition_factory_.ForgetUnusedPatterns(used_patterns);
}

void URLMatcher::UpdateInternalDatastructures() {
  UpdateSubstringSetMatcher(false);
  UpdateSubstringSetMatcher(true);
  UpdateRegexSetMatcher();
  UpdateTriggers();
  UpdateConditionFactory();
}

}  // namespace url_matcher

普通文本 | 881行 | 30.54 KB

// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_matcher/url_matcher.h"

#include <algorithm>
#include <iterator>

#include "base/logging.h"
#include "url/gurl.h"
#include "url/url_canon.h"

namespace url_matcher {

// This set of classes implement a mapping of URL Component Patterns, such as
// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
// for use in substring comparisons.
//
// The idea of this mapping is to reduce the problem of comparing many
// URL Component Patterns against one URL to the problem of searching many
// substrings in one string:
//
// ----------------------                    -----------------
// | URL Query operator | ----translate----> | StringPattern |
// ----------------------                    -----------------
//                                                   ^
//                                                   |
//                                                compare
//                                                   |
//                                                   v
// ----------------------                    -----------------
// | URL to compare     |                    |               |
// | to all URL Query   | ----translate----> | String        |
// | operators          |                    |               |
// ----------------------                    -----------------
//
// The reason for this problem reduction is that there are efficient algorithms
// for searching many substrings in one string (see Aho-Corasick algorithm).
//
// Additionally, some of the same pieces are reused to implement regular
// expression comparisons. The FilteredRE2 implementation for matching many
// regular expressions against one string uses prefiltering, in which a set
// of substrings (derived from the regexes) are first searched for, to reduce
// the number of regular expressions to test; the prefiltering step also
// uses Aho-Corasick.
//
// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
// ==========================================================
//
// For searches in this class, we normalize URLs as follows:
//
// Step 1:
// Remove scheme, port and segment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
//
// We remove the scheme and port number because they can be checked later
// in a secondary filter step. We remove the segment (the #... part) because
// this is not guaranteed to be ASCII-7 encoded.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - ED = End of Domain
// - EP = End of Path
// - EU = End of URL
// Furthermore, the hostname is canonicalized to start with a ".".
//
// Position markers are represented as characters >127, which are therefore
// guaranteed not to be part of the ASCII-7 encoded URL character set.
//
// -> www.example.com/index.html?search=foo becomes
// BU .www.example.com ED /index.html EP ?search=foo EU
//
// -> www.example.com/index.html becomes
// BU .www.example.com ED /index.html EP EU
//
// Step 3:
// Translate URL Component Patterns as follows:
//
// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
// -> host_prefix("www.example") = BU .www.example
//
// host_suffix(suffix) = suffix ED
// -> host_suffix("example.com") = example.com ED
// -> host_suffix(".example.com") = .example.com ED
//
// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
// -> host_equals("www.example.com") = BU .www.example.com ED
//
// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
//
// With this, we can search the StringPatterns in the normalized URL.
//
//
// Case 2: url_{prefix,suffix,equals,contains} searches.
// =====================================================
//
// Step 1: as above, except that
// - the scheme is not removed
// - the port is not removed if it is specified and does not match the default
//   port for the given scheme.
//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - EU = End of URL
//
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
// BU http://www.example.com:8080/index.html?search=foo EU
// -> http://www.example.com:80/index.html?search=foo#first_match becomes
// BU http://www.example.com/index.html?search=foo EU
//
// url_prefix(prefix) = BU prefix
// -> url_prefix("http://www.example") = BU http://www.example
//
// url_contains(substring) = substring
// -> url_contains("index") = index
//
//
// Case 3: {host,path,query}_contains searches.
// ============================================
//
// These kinds of searches are not supported directly but can be derived
// by a combination of a url_contains() query followed by an explicit test:
//
// host_contains(str) = url_contains(str) followed by test whether str occurs
//   in host component of original URL.
// -> host_contains("example.co") = example.co
//    followed by gurl.host().find("example.co");
//
// [similarly for path_contains and query_contains].
//
//
// Regular expression matching (url_matches searches)
// ==================================================
//
// This class also supports matching regular expressions (RE2 syntax)
// against full URLs, which are transformed as in case 2.

namespace {

bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
  return criterion == URLMatcherCondition::URL_MATCHES;
}

bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
  return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
}

}  // namespace

//
// URLMatcherCondition
//

URLMatcherCondition::URLMatcherCondition()
    : criterion_(HOST_PREFIX),
      string_pattern_(NULL) {}

URLMatcherCondition::~URLMatcherCondition() {}

URLMatcherCondition::URLMatcherCondition(
    Criterion criterion,
    const StringPattern* string_pattern)
    : criterion_(criterion),
      string_pattern_(string_pattern) {}

URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    : criterion_(rhs.criterion_),
      string_pattern_(rhs.string_pattern_) {}

URLMatcherCondition& URLMatcherCondition::operator=(
    const URLMatcherCondition& rhs) {
  criterion_ = rhs.criterion_;
  string_pattern_ = rhs.string_pattern_;
  return *this;
}

bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
  if (criterion_ < rhs.criterion_) return true;
  if (criterion_ > rhs.criterion_) return false;
  if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    return *string_pattern_ < *rhs.string_pattern_;
  if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
  // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
  // or both are NULL.
  return false;
}

bool URLMatcherCondition::IsFullURLCondition() const {
  // For these criteria the SubstringMatcher needs to be executed on the
  // GURL that is canonicalized with
  // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
  switch (criterion_) {
    case HOST_CONTAINS:
    case PATH_CONTAINS:
    case QUERY_CONTAINS:
    case URL_PREFIX:
    case URL_SUFFIX:
    case URL_CONTAINS:
    case URL_EQUALS:
      return true;
    default:
      break;
  }
  return false;
}

bool URLMatcherCondition::IsRegexCondition() const {
  return IsRegexCriterion(criterion_);
}

bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
  return IsOriginAndPathRegexCriterion(criterion_);
}

bool URLMatcherCondition::IsMatch(
    const std::set<StringPattern::ID>& matching_patterns,
    const GURL& url) const {
  DCHECK(string_pattern_);
  if (!ContainsKey(matching_patterns, string_pattern_->id()))
    return false;
  // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
  // a substring match on the raw URL. In case of a match, we need to verify
  // that the match was found in the correct component of the URL.
  switch (criterion_) {
    case HOST_CONTAINS:
      return url.host().find(string_pattern_->pattern()) !=
          std::string::npos;
    case PATH_CONTAINS:
      return url.path().find(string_pattern_->pattern()) !=
          std::string::npos;
    case QUERY_CONTAINS:
      return url.query().find(string_pattern_->pattern()) !=
          std::string::npos;
    default:
      break;
  }
  return true;
}

//
// URLMatcherConditionFactory
//

namespace {
// These are symbols that are not contained in 7-bit ASCII used in GURLs.
const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
const char kEndOfDomain[] = {static_cast<char>(-2), 0};
const char kEndOfPath[] = {static_cast<char>(-3), 0};
const char kEndOfURL[] = {static_cast<char>(-4), 0};
}  // namespace

URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}

URLMatcherConditionFactory::~URLMatcherConditionFactory() {
  STLDeleteElements(&substring_pattern_singletons_);
  STLDeleteElements(&regex_pattern_singletons_);
  STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
}

std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    const GURL& url) const {
  return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
         url.path() + kEndOfPath +
         (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::HOST_PREFIX,
      kBeginningOfURL + CanonicalizeHostname(prefix));
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
      suffix + kEndOfDomain);
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::HOST_EQUALS,
      kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::PATH_PREFIX,
      kEndOfDomain + prefix);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::PATH_EQUALS,
      kEndOfDomain + str + kEndOfPath);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    const std::string& prefix) {
  std::string pattern;
  if (!prefix.empty() && prefix[0] == '?')
    pattern = kEndOfPath + prefix;
  else
    pattern = kEndOfPath + ('?' + prefix);

  return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    const std::string& suffix) {
  if (!suffix.empty() && suffix[0] == '?') {
    return CreateQueryEqualsCondition(suffix);
  } else {
    return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
                           suffix + kEndOfURL);
  }
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    const std::string& str) {
  if (!str.empty() && str[0] == '?')
    return CreateQueryPrefixCondition(str);
  else
    return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    const std::string& str) {
  std::string pattern;
  if (!str.empty() && str[0] == '?')
    pattern = kEndOfPath + str + kEndOfURL;
  else
    pattern = kEndOfPath + ('?' + str) + kEndOfURL;

  return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
}

URLMatcherCondition
    URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    const std::string& host_suffix,
    const std::string& path_prefix) {
  return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
      host_suffix + kEndOfDomain + path_prefix);
}

URLMatcherCondition
URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    const std::string& host,
    const std::string& path_prefix) {
  return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
      kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
      path_prefix);
}

std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    const GURL& url) const {
  GURL::Replacements replacements;
  replacements.ClearPassword();
  replacements.ClearUsername();
  replacements.ClearRef();
  // Clear port if it is implicit from scheme.
  if (url.has_port()) {
    const std::string& port = url.scheme();
    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
            url.EffectiveIntPort()) {
      replacements.ClearPort();
    }
  }
  return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
      kEndOfURL;
}

static std::string CanonicalizeURLForRegexSearchesHelper(
    const GURL& url,
    bool clear_query) {
  GURL::Replacements replacements;
  replacements.ClearPassword();
  replacements.ClearUsername();
  replacements.ClearRef();
  if (clear_query)
    replacements.ClearQuery();
  // Clear port if it is implicit from scheme.
  if (url.has_port()) {
    const std::string& port = url.scheme();
    if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
            url.EffectiveIntPort()) {
      replacements.ClearPort();
    }
  }
  return url.ReplaceComponents(replacements).spec();
}

std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    const GURL& url) const {
  return CanonicalizeURLForRegexSearchesHelper(url, false);
}

std::string
URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    const GURL& url) const {
  return CanonicalizeURLForRegexSearchesHelper(url, true);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    const std::string& prefix) {
  return CreateCondition(URLMatcherCondition::URL_PREFIX,
      kBeginningOfURL + prefix);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    const std::string& suffix) {
  return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    const std::string& str) {
  return CreateCondition(URLMatcherCondition::URL_EQUALS,
      kBeginningOfURL + str + kEndOfURL);
}

URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    const std::string& regex) {
  return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
}

URLMatcherCondition
URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    const std::string& regex) {
  return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
}

void URLMatcherConditionFactory::ForgetUnusedPatterns(
      const std::set<StringPattern::ID>& used_patterns) {
  PatternSingletons::iterator i = substring_pattern_singletons_.begin();
  while (i != substring_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      substring_pattern_singletons_.erase(i++);
    }
  }
  i = regex_pattern_singletons_.begin();
  while (i != regex_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      regex_pattern_singletons_.erase(i++);
    }
  }
  i = origin_and_path_regex_pattern_singletons_.begin();
  while (i != origin_and_path_regex_pattern_singletons_.end()) {
    if (ContainsKey(used_patterns, (*i)->id())) {
      ++i;
    } else {
      delete *i;
      origin_and_path_regex_pattern_singletons_.erase(i++);
    }
  }
}

bool URLMatcherConditionFactory::IsEmpty() const {
  return substring_pattern_singletons_.empty() &&
      regex_pattern_singletons_.empty() &&
      origin_and_path_regex_pattern_singletons_.empty();
}

URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    URLMatcherCondition::Criterion criterion,
    const std::string& pattern) {
  StringPattern search_pattern(pattern, 0);
  PatternSingletons* pattern_singletons = NULL;
  if (IsRegexCriterion(criterion))
    pattern_singletons = &regex_pattern_singletons_;
  else if (IsOriginAndPathRegexCriterion(criterion))
    pattern_singletons = &origin_and_path_regex_pattern_singletons_;
  else
    pattern_singletons = &substring_pattern_singletons_;

  PatternSingletons::const_iterator iter =
      pattern_singletons->find(&search_pattern);

  if (iter != pattern_singletons->end()) {
    return URLMatcherCondition(criterion, *iter);
  } else {
    StringPattern* new_pattern =
        new StringPattern(pattern, id_counter_++);
    pattern_singletons->insert(new_pattern);
    return URLMatcherCondition(criterion, new_pattern);
  }
}

std::string URLMatcherConditionFactory::CanonicalizeHostname(
    const std::string& hostname) const {
  if (!hostname.empty() && hostname[0] == '.')
    return hostname;
  else
    return "." + hostname;
}

bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    StringPattern* lhs,
    StringPattern* rhs) const {
  if (lhs == NULL && rhs != NULL) return true;
  if (lhs != NULL && rhs != NULL)
    return lhs->pattern() < rhs->pattern();
  // Either both are NULL or only rhs is NULL.
  return false;
}

//
// URLMatcherSchemeFilter
//

URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    : filters_(1) {
  filters_.push_back(filter);
}

URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    const std::vector<std::string>& filters)
    : filters_(filters) {}

URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}

bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
  return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
      filters_.end();
}

//
// URLMatcherPortFilter
//

URLMatcherPortFilter::URLMatcherPortFilter(
    const std::vector<URLMatcherPortFilter::Range>& ranges)
    : ranges_(ranges) {}

URLMatcherPortFilter::~URLMatcherPortFilter() {}

bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
  int port = url.EffectiveIntPort();
  for (std::vector<Range>::const_iterator i = ranges_.begin();
       i != ranges_.end(); ++i) {
    if (i->first <= port && port <= i->second)
      return true;
  }
  return false;
}

// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
                                                              int to) {
  return Range(from, to);
}

// static
URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
  return Range(port, port);
}

//
// URLMatcherConditionSet
//

URLMatcherConditionSet::~URLMatcherConditionSet() {}

URLMatcherConditionSet::URLMatcherConditionSet(
    ID id,
    const Conditions& conditions)
    : id_(id),
      conditions_(conditions) {}

URLMatcherConditionSet::URLMatcherConditionSet(
    ID id,
    const Conditions& conditions,
    scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    scoped_ptr<URLMatcherPortFilter> port_filter)
    : id_(id),
      conditions_(conditions),
      scheme_filter_(scheme_filter.Pass()),
      port_filter_(port_filter.Pass()) {}

bool URLMatcherConditionSet::IsMatch(
    const std::set<StringPattern::ID>& matching_patterns,
    const GURL& url) const {
  for (Conditions::const_iterator i = conditions_.begin();
       i != conditions_.end(); ++i) {
    if (!i->IsMatch(matching_patterns, url))
      return false;
  }
  if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    return false;
  if (port_filter_.get() && !port_filter_->IsMatch(url))
    return false;
  return true;
}

//
// URLMatcher
//

URLMatcher::URLMatcher() {}

URLMatcher::~URLMatcher() {}

void URLMatcher::AddConditionSets(
    const URLMatcherConditionSet::Vector& condition_sets) {
  for (URLMatcherConditionSet::Vector::const_iterator i =
       condition_sets.begin(); i != condition_sets.end(); ++i) {
    DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
        url_matcher_condition_sets_.end());
    url_matcher_condition_sets_[(*i)->id()] = *i;
  }
  UpdateInternalDatastructures();
}

void URLMatcher::RemoveConditionSets(
    const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
  for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
       condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    DCHECK(url_matcher_condition_sets_.find(*i) !=
        url_matcher_condition_sets_.end());
    url_matcher_condition_sets_.erase(*i);
  }
  UpdateInternalDatastructures();
}

void URLMatcher::ClearUnusedConditionSets() {
  UpdateConditionFactory();
}

std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    const GURL& url) const {
  // Find all IDs of StringPatterns that match |url|.
  // See URLMatcherConditionFactory for the canonicalization of URLs and the
  // distinction between full url searches and url component searches.
  std::set<StringPattern::ID> matches;
  if (!full_url_matcher_.IsEmpty()) {
    full_url_matcher_.Match(
        condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
  }
  if (!url_component_matcher_.IsEmpty()) {
    url_component_matcher_.Match(
        condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
  }
  if (!regex_set_matcher_.IsEmpty()) {
    regex_set_matcher_.Match(
        condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
  }
  if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    origin_and_path_regex_set_matcher_.Match(
        condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
        &matches);
  }

  // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
  // were fulfilled.
  std::set<URLMatcherConditionSet::ID> result;
  for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
       i != matches.end(); ++i) {
    // For each URLMatcherConditionSet there is exactly one condition
    // registered in substring_match_triggers_. This means that the following
    // logic tests each URLMatcherConditionSet exactly once if it can be
    // completely fulfilled.
    StringPatternTriggers::const_iterator triggered_condition_sets_iter =
        substring_match_triggers_.find(*i);
    if (triggered_condition_sets_iter == substring_match_triggers_.end())
      continue;  // Not all substring matches are triggers for a condition set.
    const std::set<URLMatcherConditionSet::ID>& condition_sets =
        triggered_condition_sets_iter->second;
    for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
         condition_sets.begin(); j != condition_sets.end(); ++j) {
      URLMatcherConditionSets::const_iterator condition_set_iter =
          url_matcher_condition_sets_.find(*j);
      DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
      if (condition_set_iter->second->IsMatch(matches, url))
        result.insert(*j);
    }
  }

  return result;
}

bool URLMatcher::IsEmpty() const {
  return condition_factory_.IsEmpty() &&
      url_matcher_condition_sets_.empty() &&
      substring_match_triggers_.empty() &&
      full_url_matcher_.IsEmpty() &&
      url_component_matcher_.IsEmpty() &&
      regex_set_matcher_.IsEmpty() &&
      origin_and_path_regex_set_matcher_.IsEmpty() &&
      registered_full_url_patterns_.empty() &&
      registered_url_component_patterns_.empty();
}

void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
  // The purpose of |full_url_conditions| is just that we need to execute
  // the same logic once for Full URL searches and once for URL Component
  // searches (see URLMatcherConditionFactory).

  // Determine which patterns need to be registered when this function
  // terminates.
  std::set<const StringPattern*> new_patterns;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      // If we are called to process Full URL searches, ignore others, and
      // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
      if (!condition_iter->IsRegexCondition() &&
          !condition_iter->IsOriginAndPathRegexCondition() &&
          full_url_conditions == condition_iter->IsFullURLCondition())
        new_patterns.insert(condition_iter->string_pattern());
    }
  }

  // This is the set of patterns that were registered before this function
  // is called.
  std::set<const StringPattern*>& registered_patterns =
      full_url_conditions ? registered_full_url_patterns_
                          : registered_url_component_patterns_;

  // Add all patterns that are in new_patterns but not in registered_patterns.
  std::vector<const StringPattern*> patterns_to_register =
      base::STLSetDifference<std::vector<const StringPattern*> >(
          new_patterns, registered_patterns);

  // Remove all patterns that are in registered_patterns but not in
  // new_patterns.
  std::vector<const StringPattern*> patterns_to_unregister =
      base::STLSetDifference<std::vector<const StringPattern*> >(
           registered_patterns, new_patterns);

  // Update the SubstringSetMatcher.
  SubstringSetMatcher& url_matcher =
      full_url_conditions ? full_url_matcher_ : url_component_matcher_;
  url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
                                            patterns_to_unregister);

  // Update the set of registered_patterns for the next time this function
  // is being called.
  registered_patterns.swap(new_patterns);
}

void URLMatcher::UpdateRegexSetMatcher() {
  std::vector<const StringPattern*> new_patterns;
  std::vector<const StringPattern*> new_origin_and_path_patterns;

  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      if (condition_iter->IsRegexCondition()) {
        new_patterns.push_back(condition_iter->string_pattern());
      } else if (condition_iter->IsOriginAndPathRegexCondition()) {
        new_origin_and_path_patterns.push_back(
            condition_iter->string_pattern());
      }
    }
  }

  // Start over from scratch. We can't really do better than this, since the
  // FilteredRE2 backend doesn't support incremental updates.
  regex_set_matcher_.ClearPatterns();
  regex_set_matcher_.AddPatterns(new_patterns);
  origin_and_path_regex_set_matcher_.ClearPatterns();
  origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
}

void URLMatcher::UpdateTriggers() {
  // Count substring pattern frequencies.
  std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      const StringPattern* pattern = condition_iter->string_pattern();
      substring_pattern_frequencies[pattern->id()]++;
    }
  }

  // Update trigger conditions: Determine for each URLMatcherConditionSet which
  // URLMatcherCondition contains a StringPattern that occurs least
  // frequently in this URLMatcher. We assume that this condition is very
  // specific and occurs rarely in URLs. If a match occurs for this
  // URLMatcherCondition, we want to test all other URLMatcherCondition in the
  // respective URLMatcherConditionSet as well to see whether the entire
  // URLMatcherConditionSet is considered matching.
  substring_match_triggers_.clear();
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    if (conditions.empty())
      continue;
    URLMatcherConditionSet::Conditions::const_iterator condition_iter =
        conditions.begin();
    StringPattern::ID trigger = condition_iter->string_pattern()->id();
    // We skip the first element in the following loop.
    ++condition_iter;
    for (; condition_iter != conditions.end(); ++condition_iter) {
      StringPattern::ID current_id =
          condition_iter->string_pattern()->id();
      if (substring_pattern_frequencies[trigger] >
          substring_pattern_frequencies[current_id]) {
        trigger = current_id;
      }
    }
    substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
  }
}

void URLMatcher::UpdateConditionFactory() {
  std::set<StringPattern::ID> used_patterns;
  for (URLMatcherConditionSets::const_iterator condition_set_iter =
      url_matcher_condition_sets_.begin();
      condition_set_iter != url_matcher_condition_sets_.end();
      ++condition_set_iter) {
    const URLMatcherConditionSet::Conditions& conditions =
        condition_set_iter->second->conditions();
    for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin(); condition_iter != conditions.end();
         ++condition_iter) {
      used_patterns.insert(condition_iter->string_pattern()->id());
    }
  }
  condition_factory_.ForgetUnusedPatterns(used_patterns);
}

void URLMatcher::UpdateInternalDatastructures() {
  UpdateSubstringSetMatcher(false);
  UpdateSubstringSetMatcher(true);
  UpdateRegexSetMatcher();
  UpdateTriggers();
  UpdateConditionFactory();
}

}  // namespace url_matcher

登录后可以享受更多权益