// synchronize.h // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Copyright 2005-2010 Google, Inc. // Author: allauzen@google.com (Cyril Allauzen) // // \file // Synchronize an FST with bounded delay. #ifndef FST_LIB_SYNCHRONIZE_H__ #define FST_LIB_SYNCHRONIZE_H__ #include <algorithm> #include <tr1/unordered_map> using std::tr1::unordered_map; using std::tr1::unordered_multimap; #include <tr1/unordered_set> using std::tr1::unordered_set; using std::tr1::unordered_multiset; #include <string> #include <utility> using std::pair; using std::make_pair; #include <vector> using std::vector; #include <fst/cache.h> #include <fst/test-properties.h> namespace fst { typedef CacheOptions SynchronizeFstOptions; // Implementation class for SynchronizeFst template <class A> class SynchronizeFstImpl : public CacheImpl<A> { public: using FstImpl<A>::SetType; using FstImpl<A>::SetProperties; using FstImpl<A>::SetInputSymbols; using FstImpl<A>::SetOutputSymbols; using CacheBaseImpl< CacheState<A> >::PushArc; using CacheBaseImpl< CacheState<A> >::HasArcs; using CacheBaseImpl< CacheState<A> >::HasFinal; using CacheBaseImpl< CacheState<A> >::HasStart; using CacheBaseImpl< CacheState<A> >::SetArcs; using CacheBaseImpl< CacheState<A> >::SetFinal; using CacheBaseImpl< CacheState<A> >::SetStart; typedef A Arc; typedef typename A::Label Label; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef basic_string<Label> String; struct Element { Element() {} Element(StateId s, const String *i, const String *o) : state(s), istring(i), ostring(o) {} StateId state; // Input state Id const String *istring; // Residual input labels const String *ostring; // Residual output labels // Residual strings are represented by const pointers to // basic_string<Label> and are stored in a hash_set. The pointed // memory is owned by the hash_set string_set_. }; SynchronizeFstImpl(const Fst<A> &fst, const SynchronizeFstOptions &opts) : CacheImpl<A>(opts), fst_(fst.Copy()) { SetType("synchronize"); uint64 props = fst.Properties(kFstProperties, false); SetProperties(SynchronizeProperties(props), kCopyProperties); SetInputSymbols(fst.InputSymbols()); SetOutputSymbols(fst.OutputSymbols()); } SynchronizeFstImpl(const SynchronizeFstImpl &impl) : CacheImpl<A>(impl), fst_(impl.fst_->Copy(true)) { SetType("synchronize"); SetProperties(impl.Properties(), kCopyProperties); SetInputSymbols(impl.InputSymbols()); SetOutputSymbols(impl.OutputSymbols()); } ~SynchronizeFstImpl() { delete fst_; // Extract pointers from the hash set vector<const String*> strings; typename StringSet::iterator it = string_set_.begin(); for (; it != string_set_.end(); ++it) strings.push_back(*it); // Free the extracted pointers for (size_t i = 0; i < strings.size(); ++i) delete strings[i]; } StateId Start() { if (!HasStart()) { StateId s = fst_->Start(); if (s == kNoStateId) return kNoStateId; const String *empty = FindString(new String()); StateId start = FindState(Element(fst_->Start(), empty, empty)); SetStart(start); } return CacheImpl<A>::Start(); } Weight Final(StateId s) { if (!HasFinal(s)) { const Element &e = elements_[s]; Weight w = e.state == kNoStateId ? Weight::One() : fst_->Final(e.state); if ((w != Weight::Zero()) && (e.istring)->empty() && (e.ostring)->empty()) SetFinal(s, w); else SetFinal(s, Weight::Zero()); } return CacheImpl<A>::Final(s); } size_t NumArcs(StateId s) { if (!HasArcs(s)) Expand(s); return CacheImpl<A>::NumArcs(s); } size_t NumInputEpsilons(StateId s) { if (!HasArcs(s)) Expand(s); return CacheImpl<A>::NumInputEpsilons(s); } size_t NumOutputEpsilons(StateId s) { if (!HasArcs(s)) Expand(s); return CacheImpl<A>::NumOutputEpsilons(s); } uint64 Properties() const { return Properties(kFstProperties); } // Set error if found; return FST impl properties. uint64 Properties(uint64 mask) const { if ((mask & kError) && fst_->Properties(kError, false)) SetProperties(kError, kError); return FstImpl<Arc>::Properties(mask); } void InitArcIterator(StateId s, ArcIteratorData<A> *data) { if (!HasArcs(s)) Expand(s); CacheImpl<A>::InitArcIterator(s, data); } // Returns the first character of the string obtained by // concatenating s and l. Label Car(const String *s, Label l = 0) const { if (!s->empty()) return (*s)[0]; else return l; } // Computes the residual string obtained by removing the first // character in the concatenation of s and l. const String *Cdr(const String *s, Label l = 0) { String *r = new String(); for (int i = 1; i < s->size(); ++i) r->push_back((*s)[i]); if (l && !(s->empty())) r->push_back(l); return FindString(r); } // Computes the concatenation of s and l. const String *Concat(const String *s, Label l = 0) { String *r = new String(); for (int i = 0; i < s->size(); ++i) r->push_back((*s)[i]); if (l) r->push_back(l); return FindString(r); } // Tests if the concatenation of s and l is empty bool Empty(const String *s, Label l = 0) const { if (s->empty()) return l == 0; else return false; } // Finds the string pointed by s in the hash set. Transfers the // pointer ownership to the hash set. const String *FindString(const String *s) { typename StringSet::iterator it = string_set_.find(s); if (it != string_set_.end()) { delete s; return (*it); } else { string_set_.insert(s); return s; } } // Finds state corresponding to an element. Creates new state // if element not found. StateId FindState(const Element &e) { typename ElementMap::iterator eit = element_map_.find(e); if (eit != element_map_.end()) { return (*eit).second; } else { StateId s = elements_.size(); elements_.push_back(e); element_map_.insert(pair<const Element, StateId>(e, s)); return s; } } // Computes the outgoing transitions from a state, creating new destination // states as needed. void Expand(StateId s) { Element e = elements_[s]; if (e.state != kNoStateId) for (ArcIterator< Fst<A> > ait(*fst_, e.state); !ait.Done(); ait.Next()) { const A &arc = ait.Value(); if (!Empty(e.istring, arc.ilabel) && !Empty(e.ostring, arc.olabel)) { const String *istring = Cdr(e.istring, arc.ilabel); const String *ostring = Cdr(e.ostring, arc.olabel); StateId d = FindState(Element(arc.nextstate, istring, ostring)); PushArc(s, Arc(Car(e.istring, arc.ilabel), Car(e.ostring, arc.olabel), arc.weight, d)); } else { const String *istring = Concat(e.istring, arc.ilabel); const String *ostring = Concat(e.ostring, arc.olabel); StateId d = FindState(Element(arc.nextstate, istring, ostring)); PushArc(s, Arc(0 , 0, arc.weight, d)); } } Weight w = e.state == kNoStateId ? Weight::One() : fst_->Final(e.state); if ((w != Weight::Zero()) && ((e.istring)->size() + (e.ostring)->size() > 0)) { const String *istring = Cdr(e.istring); const String *ostring = Cdr(e.ostring); StateId d = FindState(Element(kNoStateId, istring, ostring)); PushArc(s, Arc(Car(e.istring), Car(e.ostring), w, d)); } SetArcs(s); } private: // Equality function for Elements, assume strings have been hashed. class ElementEqual { public: bool operator()(const Element &x, const Element &y) const { return x.state == y.state && x.istring == y.istring && x.ostring == y.ostring; } }; // Hash function for Elements to Fst states. class ElementKey { public: size_t operator()(const Element &x) const { size_t key = x.state; key = (key << 1) ^ (x.istring)->size(); for (size_t i = 0; i < (x.istring)->size(); ++i) key = (key << 1) ^ (*x.istring)[i]; key = (key << 1) ^ (x.ostring)->size(); for (size_t i = 0; i < (x.ostring)->size(); ++i) key = (key << 1) ^ (*x.ostring)[i]; return key; } }; // Equality function for strings class StringEqual { public: bool operator()(const String * const &x, const String * const &y) const { if (x->size() != y->size()) return false; for (size_t i = 0; i < x->size(); ++i) if ((*x)[i] != (*y)[i]) return false; return true; } }; // Hash function for set of strings class StringKey{ public: size_t operator()(const String * const & x) const { size_t key = x->size(); for (size_t i = 0; i < x->size(); ++i) key = (key << 1) ^ (*x)[i]; return key; } }; typedef unordered_map<Element, StateId, ElementKey, ElementEqual> ElementMap; typedef unordered_set<const String*, StringKey, StringEqual> StringSet; const Fst<A> *fst_; vector<Element> elements_; // mapping Fst state to Elements ElementMap element_map_; // mapping Elements to Fst state StringSet string_set_; void operator=(const SynchronizeFstImpl<A> &); // disallow }; // Synchronizes a transducer. This version is a delayed Fst. The // result will be an equivalent FST that has the property that during // the traversal of a path, the delay is either zero or strictly // increasing, where the delay is the difference between the number of // non-epsilon output labels and input labels along the path. // // For the algorithm to terminate, the input transducer must have // bounded delay, i.e., the delay of every cycle must be zero. // // Complexity: // - A has bounded delay: exponential // - A does not have bounded delay: does not terminate // // References: // - Mehryar Mohri. Edit-Distance of Weighted Automata: General // Definitions and Algorithms, International Journal of Computer // Science, 14(6): 957-982 (2003). // // This class attaches interface to implementation and handles // reference counting, delegating most methods to ImplToFst. template <class A> class SynchronizeFst : public ImplToFst< SynchronizeFstImpl<A> > { public: friend class ArcIterator< SynchronizeFst<A> >; friend class StateIterator< SynchronizeFst<A> >; typedef A Arc; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef CacheState<A> State; typedef SynchronizeFstImpl<A> Impl; SynchronizeFst(const Fst<A> &fst) : ImplToFst<Impl>(new Impl(fst, SynchronizeFstOptions())) {} SynchronizeFst(const Fst<A> &fst, const SynchronizeFstOptions &opts) : ImplToFst<Impl>(new Impl(fst, opts)) {} // See Fst<>::Copy() for doc. SynchronizeFst(const SynchronizeFst<A> &fst, bool safe = false) : ImplToFst<Impl>(fst, safe) {} // Get a copy of this SynchronizeFst. See Fst<>::Copy() for further doc. virtual SynchronizeFst<A> *Copy(bool safe = false) const { return new SynchronizeFst<A>(*this, safe); } virtual inline void InitStateIterator(StateIteratorData<A> *data) const; virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { GetImpl()->InitArcIterator(s, data); } private: // Makes visible to friends. Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); } void operator=(const SynchronizeFst<A> &fst); // Disallow }; // Specialization for SynchronizeFst. template<class A> class StateIterator< SynchronizeFst<A> > : public CacheStateIterator< SynchronizeFst<A> > { public: explicit StateIterator(const SynchronizeFst<A> &fst) : CacheStateIterator< SynchronizeFst<A> >(fst, fst.GetImpl()) {} }; // Specialization for SynchronizeFst. template <class A> class ArcIterator< SynchronizeFst<A> > : public CacheArcIterator< SynchronizeFst<A> > { public: typedef typename A::StateId StateId; ArcIterator(const SynchronizeFst<A> &fst, StateId s) : CacheArcIterator< SynchronizeFst<A> >(fst.GetImpl(), s) { if (!fst.GetImpl()->HasArcs(s)) fst.GetImpl()->Expand(s); } private: DISALLOW_COPY_AND_ASSIGN(ArcIterator); }; template <class A> inline void SynchronizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const { data->base = new StateIterator< SynchronizeFst<A> >(*this); } // Synchronizes a transducer. This version writes the synchronized // result to a MutableFst. The result will be an equivalent FST that // has the property that during the traversal of a path, the delay is // either zero or strictly increasing, where the delay is the // difference between the number of non-epsilon output labels and // input labels along the path. // // For the algorithm to terminate, the input transducer must have // bounded delay, i.e., the delay of every cycle must be zero. // // Complexity: // - A has bounded delay: exponential // - A does not have bounded delay: does not terminate // // References: // - Mehryar Mohri. Edit-Distance of Weighted Automata: General // Definitions and Algorithms, International Journal of Computer // Science, 14(6): 957-982 (2003). template<class Arc> void Synchronize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst) { SynchronizeFstOptions opts; opts.gc_limit = 0; // Cache only the last state for fastest copy. *ofst = SynchronizeFst<Arc>(ifst, opts); } } // namespace fst #endif // FST_LIB_SYNCHRONIZE_H__