// Copyright 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/url_matcher/substring_set_matcher.h" #include <algorithm> #include <queue> #include "base/logging.h" #include "base/stl_util.h" namespace url_matcher { namespace { // Compare StringPattern instances based on their string patterns. bool ComparePatterns(const StringPattern* a, const StringPattern* b) { return a->pattern() < b->pattern(); } // Given the set of patterns, compute how many nodes will the corresponding // Aho-Corasick tree have. Note that |patterns| need to be sorted. uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { uint32 result = 1u; // 1 for the root node. if (patterns.empty()) return result; std::vector<const StringPattern*>::const_iterator last = patterns.begin(); std::vector<const StringPattern*>::const_iterator current = last + 1; // For the first pattern, each letter is a label of an edge to a new node. result += (*last)->pattern().size(); // For the subsequent patterns, only count the edges which were not counted // yet. For this it suffices to test against the previous pattern, because the // patterns are sorted. for (; current != patterns.end(); ++last, ++current) { const std::string& last_pattern = (*last)->pattern(); const std::string& current_pattern = (*current)->pattern(); const uint32 prefix_bound = std::min(last_pattern.size(), current_pattern.size()); uint32 common_prefix = 0; while (common_prefix < prefix_bound && last_pattern[common_prefix] == current_pattern[common_prefix]) ++common_prefix; result += current_pattern.size() - common_prefix; } return result; } } // namespace // // SubstringSetMatcher // SubstringSetMatcher::SubstringSetMatcher() { RebuildAhoCorasickTree(SubstringPatternVector()); } SubstringSetMatcher::~SubstringSetMatcher() {} void SubstringSetMatcher::RegisterPatterns( const std::vector<const StringPattern*>& patterns) { RegisterAndUnregisterPatterns(patterns, std::vector<const StringPattern*>()); } void SubstringSetMatcher::UnregisterPatterns( const std::vector<const StringPattern*>& patterns) { RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(), patterns); } void SubstringSetMatcher::RegisterAndUnregisterPatterns( const std::vector<const StringPattern*>& to_register, const std::vector<const StringPattern*>& to_unregister) { // Register patterns. for (std::vector<const StringPattern*>::const_iterator i = to_register.begin(); i != to_register.end(); ++i) { DCHECK(patterns_.find((*i)->id()) == patterns_.end()); patterns_[(*i)->id()] = *i; } // Unregister patterns for (std::vector<const StringPattern*>::const_iterator i = to_unregister.begin(); i != to_unregister.end(); ++i) { patterns_.erase((*i)->id()); } // Now we compute the total number of tree nodes needed. SubstringPatternVector sorted_patterns; sorted_patterns.resize(patterns_.size()); size_t next = 0; for (SubstringPatternMap::const_iterator i = patterns_.begin(); i != patterns_.end(); ++i, ++next) { sorted_patterns[next] = i->second; } std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); tree_.reserve(TreeSize(sorted_patterns)); RebuildAhoCorasickTree(sorted_patterns); } bool SubstringSetMatcher::Match(const std::string& text, std::set<StringPattern::ID>* matches) const { const size_t old_number_of_matches = matches->size(); // Handle patterns matching the empty string. matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); uint32 current_node = 0; for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { uint32 edge_from_current = tree_[current_node].GetEdge(*i); while (edge_from_current == AhoCorasickNode::kNoSuchEdge && current_node != 0) { current_node = tree_[current_node].failure(); edge_from_current = tree_[current_node].GetEdge(*i); } if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { current_node = edge_from_current; matches->insert(tree_[current_node].matches().begin(), tree_[current_node].matches().end()); } else { DCHECK_EQ(0u, current_node); } } return old_number_of_matches != matches->size(); } bool SubstringSetMatcher::IsEmpty() const { // An empty tree consists of only the root node. return patterns_.empty() && tree_.size() == 1u; } void SubstringSetMatcher::RebuildAhoCorasickTree( const SubstringPatternVector& sorted_patterns) { tree_.clear(); // Initialize root note of tree. AhoCorasickNode root; root.set_failure(0); tree_.push_back(root); // Insert all patterns. for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); i != sorted_patterns.end(); ++i) { InsertPatternIntoAhoCorasickTree(*i); } CreateFailureEdges(); } void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( const StringPattern* pattern) { const std::string& text = pattern->pattern(); const std::string::const_iterator text_end = text.end(); // Iterators on the tree and the text. uint32 current_node = 0; std::string::const_iterator i = text.begin(); // Follow existing paths for as long as possible. while (i != text_end) { uint32 edge_from_current = tree_[current_node].GetEdge(*i); if (edge_from_current == AhoCorasickNode::kNoSuchEdge) break; current_node = edge_from_current; ++i; } // Create new nodes if necessary. while (i != text_end) { tree_.push_back(AhoCorasickNode()); tree_[current_node].SetEdge(*i, tree_.size() - 1); current_node = tree_.size() - 1; ++i; } // Register match. tree_[current_node].AddMatch(pattern->id()); } void SubstringSetMatcher::CreateFailureEdges() { typedef AhoCorasickNode::Edges Edges; std::queue<uint32> queue; AhoCorasickNode& root = tree_[0]; root.set_failure(0); const Edges& root_edges = root.edges(); for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); ++e) { const uint32& leads_to = e->second; tree_[leads_to].set_failure(0); queue.push(leads_to); } while (!queue.empty()) { AhoCorasickNode& current_node = tree_[queue.front()]; queue.pop(); for (Edges::const_iterator e = current_node.edges().begin(); e != current_node.edges().end(); ++e) { const char& edge_label = e->first; const uint32& leads_to = e->second; queue.push(leads_to); uint32 failure = current_node.failure(); uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && failure != 0) { failure = tree_[failure].failure(); edge_from_failure = tree_[failure].GetEdge(edge_label); } const uint32 follow_in_case_of_failure = edge_from_failure != AhoCorasickNode::kNoSuchEdge ? edge_from_failure : 0; tree_[leads_to].set_failure(follow_in_case_of_failure); tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); } } } const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() : failure_(kNoSuchEdge) {} SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( const SubstringSetMatcher::AhoCorasickNode& other) : edges_(other.edges_), failure_(other.failure_), matches_(other.matches_) {} SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::operator=( const SubstringSetMatcher::AhoCorasickNode& other) { edges_ = other.edges_; failure_ = other.failure_; matches_ = other.matches_; return *this; } uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { Edges::const_iterator i = edges_.find(c); return i == edges_.end() ? kNoSuchEdge : i->second; } void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { edges_[c] = node; } void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { matches_.insert(id); } void SubstringSetMatcher::AhoCorasickNode::AddMatches( const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { matches_.insert(matches.begin(), matches.end()); } } // namespace url_matcher