// compact-fst.h // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Copyright 2005-2010 Google, Inc. // Author: allauzen@google.com (Cyril Allauzen) // // \file // FST Class for memory-efficient representation of common types of // FSTs: linear automata, acceptors, unweighted FSTs, ... #ifndef FST_LIB_COMPACT_FST_H__ #define FST_LIB_COMPACT_FST_H__ #include <iterator> #include <utility> using std::pair; using std::make_pair; #include <vector> using std::vector; #include <fst/cache.h> #include <fst/expanded-fst.h> #include <fst/fst-decl.h> // For optional argument declarations #include <fst/mapped-file.h> #include <fst/matcher.h> #include <fst/test-properties.h> #include <fst/util.h> namespace fst { struct CompactFstOptions : public CacheOptions { // CompactFst default caching behaviour is to do no caching. Most // compactors are cheap and therefore we save memory by not doing // caching. CompactFstOptions() : CacheOptions(true, 0) {} CompactFstOptions(const CacheOptions &opts) : CacheOptions(opts) {} }; // Compactor Interface - class determinies how arcs and final weights // are compacted and expanded. // // Final weights are treated as transitions to the superfinal state, // i.e. ilabel = olabel = kNoLabel and nextstate = kNoStateId. // // There are two types of compactors: // // * Fixed out-degree compactors: 'compactor.Size()' returns a // positive integer 's'. An FST can be compacted by this compactor // only if each state has exactly 's' outgoing transitions (counting a // non-Zero() final weight as a transition). A typical example is a // compactor for string FSTs, i.e. 's == 1'. // // * Variable out-degree compactors: 'compactor.Size() == -1'. There // are no out-degree restrictions for these compactors. // // // class Compactor { // public: // // Element is the type of the compacted transitions. // typedef ... Element; // // Return the compacted representation of a transition 'arc' // // at a state 's'. // Element Compact(StateId s, const Arc &arc); // // Return the transition at state 's' represented by the compacted // // transition 'e'. // Arc Expand(StateId s, const Element &e); // // Return -1 for variable out-degree compactors, and the mandatory // // out-degree otherwise. // ssize_t Size(); // // Test whether 'fst' can be compacted by this compactor. // bool Compatible(const Fst<A> &fst); // // Return the properties that are always true for an fst // // compacted using this compactor // uint64 Properties(); // // Return a string identifying the type of compactor. // static const string &Type(); // // Write a compactor to a file. // bool Write(ostream &strm); // // Read a compactor from a file. // static Compactor *Read(istream &strm); // // Default constructor (optional, see comment below). // Compactor(); // }; // // The default constructor is only required for FST_REGISTER to work // (i.e. enabling Convert() and the command-line utilities to work // with this new compactor). However, a default constructor always // needs to be specify for this code to compile, but one can have it // simply raised an error when called: // // Compactor::Compactor() { // FSTERROR() << "Compactor: no default constructor"; // } // Implementation data for Compact Fst, which can shared between otherwise // independent copies. // // The implementation contains two arrays: 'states_' and 'compacts_'. // // For fixed out-degree compactors, the 'states_' array is unallocated. // The 'compacts_' contains the compacted transitions. Its size is // 'ncompacts_'. The outgoing transitions at a given state are stored // consecutively. For a given state 's', its 'compactor.Size()' outgoing // transitions (including superfinal transition when 's' is final), are // stored in position ['s*compactor.Size()', '(s+1)*compactor_.Size()'). // // For variable out-degree compactors, the states_ array has size // 'nstates_ + 1' and contains pointers to positions into 'compacts_'. // For a given state 's', the compacted transitions of 's' are // stored in positions [ 'states_[s]', 'states_[s + 1]' ) in 'compacts_'. // By convention, 'states_[nstates_] == ncompacts_'. // // In both cases, the superfinal transitons (when 's' is final, i.e. // 'Final(s) != Weight::Zero()') is stored first. // // The unsigned type U is used to represent indices into the compacts_ // array. template <class E, class U> class CompactFstData { public: typedef E CompactElement; typedef U Unsigned; CompactFstData() : states_region_(0), compacts_region_(0), states_(0), compacts_(0), nstates_(0), ncompacts_(0), narcs_(0), start_(kNoStateId), error_(false) {} template <class A, class Compactor> CompactFstData(const Fst<A> &fst, const Compactor &compactor); template <class Iterator, class Compactor> CompactFstData(const Iterator &begin, const Iterator &end, const Compactor &compactor); ~CompactFstData() { if (states_region_ == NULL) { delete [] states_; } delete states_region_; if (compacts_region_ == NULL) { delete [] compacts_; } delete compacts_region_; } template <class Compactor> static CompactFstData<E, U> *Read(istream &strm, const FstReadOptions &opts, const FstHeader &hdr, const Compactor &compactor); bool Write(ostream &strm, const FstWriteOptions &opts) const; Unsigned States(ssize_t i) const { return states_[i]; } const CompactElement &Compacts(size_t i) const { return compacts_[i]; } size_t NumStates() const { return nstates_; } size_t NumCompacts() const { return ncompacts_; } size_t NumArcs() const { return narcs_; } ssize_t Start() const { return start_; } int RefCount() const { return ref_count_.count(); } int IncrRefCount() { return ref_count_.Incr(); } int DecrRefCount() { return ref_count_.Decr(); } bool Error() const { return error_; } private: MappedFile *states_region_; MappedFile *compacts_region_; Unsigned *states_; CompactElement *compacts_; size_t nstates_; size_t ncompacts_; size_t narcs_; ssize_t start_; RefCounter ref_count_; bool error_; }; template <class E, class U> template <class A, class C> CompactFstData<E, U>::CompactFstData(const Fst<A> &fst, const C &compactor) : states_region_(0), compacts_region_(0), states_(0), compacts_(0), nstates_(0), ncompacts_(0), narcs_(0), start_(kNoStateId), error_(false) { typedef typename A::StateId StateId; typedef typename A::Weight Weight; start_ = fst.Start(); // Count # of states and arcs. StateId nfinals = 0; for (StateIterator< Fst<A> > siter(fst); !siter.Done(); siter.Next()) { ++nstates_; StateId s = siter.Value(); for (ArcIterator< Fst<A> > aiter(fst, s); !aiter.Done(); aiter.Next()) ++narcs_; if (fst.Final(s) != Weight::Zero()) ++nfinals; } if (compactor.Size() == -1) { states_ = new Unsigned[nstates_ + 1]; ncompacts_ = narcs_ + nfinals; compacts_ = new CompactElement[ncompacts_]; states_[nstates_] = ncompacts_; } else { states_ = 0; ncompacts_ = nstates_ * compactor.Size(); if ((narcs_ + nfinals) != ncompacts_) { FSTERROR() << "CompactFstData: compactor incompatible with fst"; error_ = true; return; } compacts_ = new CompactElement[ncompacts_]; } size_t pos = 0, fpos = 0; for (StateId s = 0; s < nstates_; ++s) { fpos = pos; if (compactor.Size() == -1) states_[s] = pos; if (fst.Final(s) != Weight::Zero()) compacts_[pos++] = compactor.Compact(s, A(kNoLabel, kNoLabel, fst.Final(s), kNoStateId)); for (ArcIterator< Fst<A> > aiter(fst, s); !aiter.Done(); aiter.Next()) { compacts_[pos++] = compactor.Compact(s, aiter.Value()); } if ((compactor.Size() != -1) && ((pos - fpos) != compactor.Size())) { FSTERROR() << "CompactFstData: compactor incompatible with fst"; error_ = true; return; } } if (pos != ncompacts_) { FSTERROR() << "CompactFstData: compactor incompatible with fst"; error_ = true; return; } } template <class E, class U> template <class Iterator, class C> CompactFstData<E, U>::CompactFstData(const Iterator &begin, const Iterator &end, const C &compactor) : states_region_(0), compacts_region_(0), states_(0), compacts_(0), nstates_(0), ncompacts_(0), narcs_(0), start_(kNoStateId), error_(false) { typedef typename C::Arc Arc; typedef typename Arc::Weight Weight; if (compactor.Size() != -1) { ncompacts_ = distance(begin, end); if (compactor.Size() == 1) { // For strings, allow implicit final weight. // Empty input is the empty string. if (ncompacts_ == 0) { ++ncompacts_; } else { Arc arc = compactor.Expand(ncompacts_ - 1, *(begin + (ncompacts_ - 1))); if (arc.ilabel != kNoLabel) ++ncompacts_; } } if (ncompacts_ % compactor.Size()) { FSTERROR() << "CompactFstData: size of input container incompatible" << " with compactor"; error_ = true; return; } if (ncompacts_ == 0) return; start_ = 0; nstates_ = ncompacts_ / compactor.Size(); compacts_ = new CompactElement[ncompacts_]; size_t i = 0; Iterator it = begin; for(; it != end; ++it, ++i){ compacts_[i] = *it; if (compactor.Expand(i, *it).ilabel != kNoLabel) ++narcs_; } if (i < ncompacts_) compacts_[i] = compactor.Compact(i, Arc(kNoLabel, kNoLabel, Weight::One(), kNoStateId)); } else { if (distance(begin, end) == 0) return; // Count # of states, arcs and compacts. Iterator it = begin; for(size_t i = 0; it != end; ++it, ++i) { Arc arc = compactor.Expand(i, *it); if (arc.ilabel != kNoLabel) { ++narcs_; ++ncompacts_; } else { ++nstates_; if (arc.weight != Weight::Zero()) ++ncompacts_; } } start_ = 0; compacts_ = new CompactElement[ncompacts_]; states_ = new Unsigned[nstates_ + 1]; states_[nstates_] = ncompacts_; size_t i = 0, s = 0; for(it = begin; it != end; ++it) { Arc arc = compactor.Expand(i, *it); if (arc.ilabel != kNoLabel) { compacts_[i++] = *it; } else { states_[s++] = i; if (arc.weight != Weight::Zero()) compacts_[i++] = *it; } } if ((s != nstates_) || (i != ncompacts_)) { FSTERROR() << "CompactFstData: ill-formed input container"; error_ = true; return; } } } template <class E, class U> template <class C> CompactFstData<E, U> *CompactFstData<E, U>::Read( istream &strm, const FstReadOptions &opts, const FstHeader &hdr, const C &compactor) { CompactFstData<E, U> *data = new CompactFstData<E, U>(); data->start_ = hdr.Start(); data->nstates_ = hdr.NumStates(); data->narcs_ = hdr.NumArcs(); if (compactor.Size() == -1) { if ((hdr.GetFlags() & FstHeader::IS_ALIGNED) && !AlignInput(strm)) { LOG(ERROR) << "CompactFst::Read: Alignment failed: " << opts.source; delete data; return 0; } size_t b = (data->nstates_ + 1) * sizeof(Unsigned); data->states_region_ = MappedFile::Map(&strm, opts, b); if (!strm || data->states_region_ == NULL) { LOG(ERROR) << "CompactFst::Read: Read failed: " << opts.source; delete data; return 0; } data->states_ = static_cast<Unsigned *>( data->states_region_->mutable_data()); } else { data->states_ = 0; } data->ncompacts_ = compactor.Size() == -1 ? data->states_[data->nstates_] : data->nstates_ * compactor.Size(); if ((hdr.GetFlags() & FstHeader::IS_ALIGNED) && !AlignInput(strm)) { LOG(ERROR) << "CompactFst::Read: Alignment failed: " << opts.source; delete data; return 0; } size_t b = data->ncompacts_ * sizeof(CompactElement); data->compacts_region_ = MappedFile::Map(&strm, opts, b); if (!strm || data->compacts_region_ == NULL) { LOG(ERROR) << "CompactFst::Read: Read failed: " << opts.source; delete data; return 0; } data->compacts_ = static_cast<CompactElement *>( data->compacts_region_->mutable_data()); return data; } template<class E, class U> bool CompactFstData<E, U>::Write(ostream &strm, const FstWriteOptions &opts) const { if (states_) { if (opts.align && !AlignOutput(strm)) { LOG(ERROR) << "CompactFst::Write: Alignment failed: " << opts.source; return false; } strm.write(reinterpret_cast<char *>(states_), (nstates_ + 1) * sizeof(Unsigned)); } if (opts.align && !AlignOutput(strm)) { LOG(ERROR) << "CompactFst::Write: Alignment failed: " << opts.source; return false; } strm.write(reinterpret_cast<char *>(compacts_), ncompacts_ * sizeof(CompactElement)); strm.flush(); if (!strm) { LOG(ERROR) << "CompactFst::Write: Write failed: " << opts.source; return false; } return true; } template <class A, class C, class U> class CompactFst; template <class F, class G> void Cast(const F &, G *); // Implementation class for CompactFst, which contains CompactFstData // and Fst cache. template <class A, class C, class U> class CompactFstImpl : public CacheImpl<A> { public: using FstImpl<A>::SetType; using FstImpl<A>::SetProperties; using FstImpl<A>::Properties; using FstImpl<A>::SetInputSymbols; using FstImpl<A>::SetOutputSymbols; using FstImpl<A>::WriteHeader; using CacheImpl<A>::PushArc; using CacheImpl<A>::HasArcs; using CacheImpl<A>::HasFinal; using CacheImpl<A>::HasStart; using CacheImpl<A>::SetArcs; using CacheImpl<A>::SetFinal; using CacheImpl<A>::SetStart; typedef A Arc; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef C Compactor; typedef typename C::Element CompactElement; typedef U Unsigned; CompactFstImpl() : CacheImpl<A>(CompactFstOptions()), compactor_(0), own_compactor_(false), data_(0) { string type = "compact"; if (sizeof(U) != sizeof(uint32)) { string size; Int64ToStr(8 * sizeof(U), &size); type += size; } type += "_"; type += C::Type(); SetType(type); SetProperties(kNullProperties | kStaticProperties); } CompactFstImpl(const Fst<Arc> &fst, const C &compactor, const CompactFstOptions &opts) : CacheImpl<A>(opts), compactor_(new C(compactor)), own_compactor_(true), data_(0) { Init(fst); } CompactFstImpl(const Fst<Arc> &fst, C *compactor, const CompactFstOptions &opts) : CacheImpl<A>(opts), compactor_(compactor), own_compactor_(false), data_(0) { Init(fst); } template <class Iterator> CompactFstImpl(const Iterator &b, const Iterator &e, const C &compactor, const CompactFstOptions &opts) : CacheImpl<A>(opts), compactor_(new C(compactor)), own_compactor_(true), data_(0) { Init(b, e); } template <class Iterator> CompactFstImpl(const Iterator &b, const Iterator &e, C *compactor, const CompactFstOptions &opts) : CacheImpl<A>(opts), compactor_(compactor), own_compactor_(false), data_(0) { Init(b, e); } CompactFstImpl(const CompactFstImpl<A, C, U> &impl) : CacheImpl<A>(impl), compactor_(new C(*impl.compactor_)), own_compactor_(true), data_(impl.data_) { if (data_) data_->IncrRefCount(); SetType(impl.Type()); SetProperties(impl.Properties()); SetInputSymbols(impl.InputSymbols()); SetOutputSymbols(impl.OutputSymbols()); } ~CompactFstImpl(){ if (own_compactor_) delete compactor_; if (data_ && !data_->DecrRefCount()) delete data_; } StateId Start() { if (!HasStart()) { SetStart(data_->Start()); } return CacheImpl<A>::Start(); } Weight Final(StateId s) { if (HasFinal(s)) return CacheImpl<A>::Final(s); Arc arc(kNoLabel, kNoLabel, Weight::Zero(), kNoStateId); if ((compactor_->Size() != -1) || (data_->States(s) != data_->States(s + 1))) arc = ComputeArc(s, compactor_->Size() == -1 ? data_->States(s) : s * compactor_->Size()); return arc.ilabel == kNoLabel ? arc.weight : Weight::Zero(); } StateId NumStates() const { if (Properties(kError)) return 0; return data_->NumStates(); } size_t NumArcs(StateId s) { if (HasArcs(s)) return CacheImpl<A>::NumArcs(s); Unsigned i, num_arcs; if (compactor_->Size() == -1) { i = data_->States(s); num_arcs = data_->States(s + 1) - i; } else { i = s * compactor_->Size(); num_arcs = compactor_->Size(); } if (num_arcs > 0) { const A &arc = ComputeArc(s, i, kArcILabelValue); if (arc.ilabel == kNoStateId) { --num_arcs; } } return num_arcs; } size_t NumInputEpsilons(StateId s) { if (!HasArcs(s) && !Properties(kILabelSorted)) Expand(s); if (HasArcs(s)) return CacheImpl<A>::NumInputEpsilons(s); return CountEpsilons(s, false); } size_t NumOutputEpsilons(StateId s) { if (!HasArcs(s) && !Properties(kOLabelSorted)) Expand(s); if (HasArcs(s)) return CacheImpl<A>::NumOutputEpsilons(s); return CountEpsilons(s, true); } size_t CountEpsilons(StateId s, bool output_epsilons) { size_t begin = compactor_->Size() == -1 ? data_->States(s) : s * compactor_->Size(); size_t end = compactor_->Size() == -1 ? data_->States(s + 1) : (s + 1) * compactor_->Size(); size_t num_eps = 0; for (size_t i = begin; i < end; ++i) { const A &arc = ComputeArc( s, i, output_epsilons ? kArcOLabelValue : kArcILabelValue); const typename A::Label &label = (output_epsilons ? arc.olabel : arc.ilabel); if (label == kNoLabel) continue; else if (label > 0) break; ++num_eps; } return num_eps; } static CompactFstImpl<A, C, U> *Read(istream &strm, const FstReadOptions &opts) { CompactFstImpl<A, C, U> *impl = new CompactFstImpl<A, C, U>(); FstHeader hdr; if (!impl->ReadHeader(strm, opts, kMinFileVersion, &hdr)) { delete impl; return 0; } // Ensures compatibility if (hdr.Version() == kAlignedFileVersion) hdr.SetFlags(hdr.GetFlags() | FstHeader::IS_ALIGNED); impl->compactor_ = C::Read(strm); if (!impl->compactor_) { delete impl; return 0; } impl->own_compactor_ = true; impl->data_ = CompactFstData<CompactElement, U>::Read(strm, opts, hdr, *impl->compactor_); if (!impl->data_) { delete impl; return 0; } return impl; } bool Write(ostream &strm, const FstWriteOptions &opts) const { FstHeader hdr; hdr.SetStart(data_->Start()); hdr.SetNumStates(data_->NumStates()); hdr.SetNumArcs(data_->NumArcs()); // Ensures compatibility int file_version = opts.align ? kAlignedFileVersion : kFileVersion; WriteHeader(strm, opts, file_version, &hdr); compactor_->Write(strm); return data_->Write(strm, opts); } // Provide information needed for generic state iterator void InitStateIterator(StateIteratorData<A> *data) const { data->base = 0; data->nstates = data_->NumStates(); } void InitArcIterator(StateId s, ArcIteratorData<A> *data) { if (!HasArcs(s)) Expand(s); CacheImpl<A>::InitArcIterator(s, data); } Arc ComputeArc(StateId s, Unsigned i, uint32 f = kArcValueFlags) const { return compactor_->Expand(s, data_->Compacts(i), f); } void Expand(StateId s) { size_t begin = compactor_->Size() == -1 ? data_->States(s) : s * compactor_->Size(); size_t end = compactor_->Size() == -1 ? data_->States(s + 1) : (s + 1) * compactor_->Size(); for (size_t i = begin; i < end; ++i) { const Arc &arc = ComputeArc(s, i); if (arc.ilabel == kNoLabel) SetFinal(s, arc.weight); else PushArc(s, arc); } if (!HasFinal(s)) SetFinal(s, Weight::Zero()); SetArcs(s); } template <class Iterator> void SetCompactElements(const Iterator &b, const Iterator &e) { if (data_ && !data_->DecrRefCount()) delete data_; data_ = new CompactFstData<CompactElement, U>(b, e, *compactor_); } C *GetCompactor() const { return compactor_; } CompactFstData<CompactElement, U> *Data() const { return data_; } // Properties always true of this Fst class static const uint64 kStaticProperties = kExpanded; protected: template <class B, class D> explicit CompactFstImpl(const CompactFstImpl<B, D, U> &impl) : CacheImpl<A>(CacheOptions(impl.GetCacheGc(), impl.GetCacheLimit())), compactor_(new C(*impl.GetCompactor())), own_compactor_(true), data_(impl.Data()) { if (data_) data_->IncrRefCount(); SetType(impl.Type()); SetProperties(impl.Properties()); SetInputSymbols(impl.InputSymbols()); SetOutputSymbols(impl.OutputSymbols()); } private: friend class CompactFst<A, C, U>; // allow access during write. void Init(const Fst<Arc> &fst) { string type = "compact"; if (sizeof(U) != sizeof(uint32)) { string size; Int64ToStr(8 * sizeof(U), &size); type += size; } type += "_"; type += compactor_->Type(); SetType(type); SetInputSymbols(fst.InputSymbols()); SetOutputSymbols(fst.OutputSymbols()); data_ = new CompactFstData<CompactElement, U>(fst, *compactor_); if (data_->Error()) SetProperties(kError, kError); uint64 copy_properties = fst.Properties(kCopyProperties, true); if ((copy_properties & kError) || !compactor_->Compatible(fst)) { FSTERROR() << "CompactFstImpl: input fst incompatible with compactor"; SetProperties(kError, kError); return; } SetProperties(copy_properties | kStaticProperties); } template <class Iterator> void Init(const Iterator &b, const Iterator &e) { string type = "compact"; if (sizeof(U) != sizeof(uint32)) { string size; Int64ToStr(8 * sizeof(U), &size); type += size; } type += "_"; type += compactor_->Type(); SetType(type); SetProperties(kStaticProperties | compactor_->Properties()); data_ = new CompactFstData<CompactElement, U>(b, e, *compactor_); if (data_->Error()) SetProperties(kError, kError); } // Current unaligned file format version static const int kFileVersion = 2; // Current aligned file format version static const int kAlignedFileVersion = 1; // Minimum file format version supported static const int kMinFileVersion = 1; C *compactor_; bool own_compactor_; CompactFstData<CompactElement, U> *data_; }; template <class A, class C, class U> const uint64 CompactFstImpl<A, C, U>::kStaticProperties; template <class A, class C, class U> const int CompactFstImpl<A, C, U>::kFileVersion; template <class A, class C, class U> const int CompactFstImpl<A, C, U>::kAlignedFileVersion; template <class A, class C, class U> const int CompactFstImpl<A, C, U>::kMinFileVersion; // CompactFst. This class attaches interface to implementation and // handles reference counting, delegating most methods to // ImplToExpandedFst. The unsigned type U is used to represent indices // into the compact arc array (uint32 by default, declared in // fst-decl.h). template <class A, class C, class U> class CompactFst : public ImplToExpandedFst< CompactFstImpl<A, C, U> > { public: friend class StateIterator< CompactFst<A, C, U> >; friend class ArcIterator< CompactFst<A, C, U> >; template <class F, class G> void friend Cast(const F &, G *); typedef A Arc; typedef typename A::StateId StateId; typedef CompactFstImpl<A, C, U> Impl; typedef CacheState<A> State; typedef U Unsigned; CompactFst() : ImplToExpandedFst<Impl>(new Impl()) {} explicit CompactFst(const Fst<A> &fst, const C &compactor = C(), const CompactFstOptions &opts = CompactFstOptions()) : ImplToExpandedFst<Impl>(new Impl(fst, compactor, opts)) {} CompactFst(const Fst<A> &fst, C *compactor, const CompactFstOptions &opts = CompactFstOptions()) : ImplToExpandedFst<Impl>(new Impl(fst, compactor, opts)) {} // The following 2 constructors take as input two iterators delimiting // a set of (already) compacted transitions, starting with the // transitions out of the initial state. The format of the input // differs for fixed out-degree and variable out-degree compactors. // // - For fixed out-degree compactors, the final weight (encoded as a // compacted transition) needs to be given only for final // states. All strings (compactor of size 1) will be assume to be // terminated by a final state even when the final state is not // implicitely given. // // - For variable out-degree compactors, the final weight (encoded // as a compacted transition) needs to be given for all states and // must appeared first in the list (for state s, final weight of s, // followed by outgoing transitons in s). // // These 2 constructors allows the direct construction of a CompactFst // without first creating a more memory hungry 'regular' FST. This // is useful when memory usage is severely constrained. template <class Iterator> explicit CompactFst(const Iterator &begin, const Iterator &end, const C &compactor = C(), const CompactFstOptions &opts = CompactFstOptions()) : ImplToExpandedFst<Impl>(new Impl(begin, end, compactor, opts)) {} template <class Iterator> CompactFst(const Iterator &begin, const Iterator &end, C *compactor, const CompactFstOptions &opts = CompactFstOptions()) : ImplToExpandedFst<Impl>(new Impl(begin, end, compactor, opts)) {} // See Fst<>::Copy() for doc. CompactFst(const CompactFst<A, C, U> &fst, bool safe = false) : ImplToExpandedFst<Impl>(fst, safe) {} // Get a copy of this CompactFst. See Fst<>::Copy() for further doc. virtual CompactFst<A, C, U> *Copy(bool safe = false) const { return new CompactFst<A, C, U>(*this, safe); } // Read a CompactFst from an input stream; return NULL on error static CompactFst<A, C, U> *Read(istream &strm, const FstReadOptions &opts) { Impl* impl = Impl::Read(strm, opts); return impl ? new CompactFst<A, C, U>(impl) : 0; } // Read a CompactFst from a file; return NULL on error // Empty filename reads from standard input static CompactFst<A, C, U> *Read(const string &filename) { Impl* impl = ImplToExpandedFst<Impl>::Read(filename); return impl ? new CompactFst<A, C, U>(impl) : 0; } virtual bool Write(ostream &strm, const FstWriteOptions &opts) const { return GetImpl()->Write(strm, opts); } virtual bool Write(const string &filename) const { return Fst<A>::WriteFile(filename); } template <class F> static bool WriteFst(const F &fst, const C &compactor, ostream &strm, const FstWriteOptions &opts); virtual void InitStateIterator(StateIteratorData<A> *data) const { GetImpl()->InitStateIterator(data); } virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { GetImpl()->InitArcIterator(s, data); } virtual MatcherBase<A> *InitMatcher(MatchType match_type) const { return new SortedMatcher<CompactFst<A, C, U> >(*this, match_type); } template <class Iterator> void SetCompactElements(const Iterator &b, const Iterator &e) { GetImpl()->SetCompactElements(b, e); } private: CompactFst(Impl *impl) : ImplToExpandedFst<Impl>(impl) {} // Makes visible to friends. Impl *GetImpl() const { return ImplToFst<Impl, ExpandedFst<A> >::GetImpl(); } void SetImpl(Impl *impl, bool own_impl = false) { ImplToFst< Impl, ExpandedFst<A> >::SetImpl(impl, own_impl); } // Use overloading to extract the type of the argument. static Impl* GetImplIfCompactFst(const CompactFst<A, C, U> &compact_fst) { return compact_fst.GetImpl(); } // This does not give privileged treatment to subclasses of CompactFst. template<typename NonCompactFst> static Impl* GetImplIfCompactFst(const NonCompactFst& fst) { return NULL; } void operator=(const CompactFst<A, C, U> &fst); // disallow }; // Writes Fst in Compact format, potentially with a pass over the machine // before writing to compute the number of states and arcs. // template <class A, class C, class U> template <class F> bool CompactFst<A, C, U>::WriteFst(const F &fst, const C &compactor, ostream &strm, const FstWriteOptions &opts) { typedef U Unsigned; typedef typename C::Element CompactElement; typedef typename A::Weight Weight; int file_version = opts.align ? CompactFstImpl<A, C, U>::kAlignedFileVersion : CompactFstImpl<A, C, U>::kFileVersion; size_t num_arcs = -1, num_states = -1, num_compacts = -1; C first_pass_compactor = compactor; if (Impl* impl = GetImplIfCompactFst(fst)) { num_arcs = impl->Data()->NumArcs(); num_states = impl->Data()->NumStates(); num_compacts = impl->Data()->NumCompacts(); first_pass_compactor = *impl->GetCompactor(); } else { // A first pass is needed to compute the state of the compactor, which // is saved ahead of the rest of the data structures. This unfortunately // means forcing a complete double compaction when writing in this format. // TODO(allauzen): eliminate mutable state from compactors. num_arcs = 0; num_states = 0; for (StateIterator<F> siter(fst); !siter.Done(); siter.Next()) { const StateId s = siter.Value(); ++num_states; if (fst.Final(s) != Weight::Zero()) { first_pass_compactor.Compact( s, A(kNoLabel, kNoLabel, fst.Final(s), kNoStateId)); } for (ArcIterator<F> aiter(fst, s); !aiter.Done(); aiter.Next()) { ++num_arcs; first_pass_compactor.Compact(s, aiter.Value()); } } } FstHeader hdr; hdr.SetStart(fst.Start()); hdr.SetNumStates(num_states); hdr.SetNumArcs(num_arcs); string type = "compact"; if (sizeof(U) != sizeof(uint32)) { string size; Int64ToStr(8 * sizeof(U), &size); type += size; } type += "_"; type += C::Type(); uint64 copy_properties = fst.Properties(kCopyProperties, true); if ((copy_properties & kError) || !compactor.Compatible(fst)) { LOG(ERROR) << "fst incompatible with compactor"; return false; } uint64 properties = copy_properties | CompactFstImpl<A, C, U>::kStaticProperties; FstImpl<A>::WriteFstHeader(fst, strm, opts, file_version, type, properties, &hdr); first_pass_compactor.Write(strm); if (first_pass_compactor.Size() == -1) { if (opts.align && !AlignOutput(strm)) { LOG(ERROR) << "CompactFst::Write: Alignment failed: " << opts.source; return false; } Unsigned compacts = 0; for (StateIterator<F> siter(fst); !siter.Done(); siter.Next()) { const StateId s = siter.Value(); strm.write(reinterpret_cast<const char *>(&compacts), sizeof(compacts)); if (fst.Final(s) != Weight::Zero()) { ++compacts; } compacts += fst.NumArcs(s); } strm.write(reinterpret_cast<const char *>(&compacts), sizeof(compacts)); } if (opts.align && !AlignOutput(strm)) { LOG(ERROR) << "Could not align file during write after writing states"; } C second_pass_compactor = compactor; CompactElement element; for (StateIterator<F> siter(fst); !siter.Done(); siter.Next()) { const StateId s = siter.Value(); if (fst.Final(s) != Weight::Zero()) { element = second_pass_compactor.Compact( s, A(kNoLabel, kNoLabel, fst.Final(s), kNoStateId)); strm.write(reinterpret_cast<const char *>(&element), sizeof(element)); } for (ArcIterator<F> aiter(fst, s); !aiter.Done(); aiter.Next()) { element = second_pass_compactor.Compact(s, aiter.Value()); strm.write(reinterpret_cast<const char *>(&element), sizeof(element)); } } strm.flush(); if (!strm) { LOG(ERROR) << "CompactFst write failed: " << opts.source; return false; } return true; } // Specialization for CompactFst; see generic version in fst.h // for sample usage (but use the CompactFst type!). This version // should inline. template <class A, class C, class U> class StateIterator< CompactFst<A, C, U> > { public: typedef typename A::StateId StateId; explicit StateIterator(const CompactFst<A, C, U> &fst) : nstates_(fst.GetImpl()->NumStates()), s_(0) {} bool Done() const { return s_ >= nstates_; } StateId Value() const { return s_; } void Next() { ++s_; } void Reset() { s_ = 0; } private: StateId nstates_; StateId s_; DISALLOW_COPY_AND_ASSIGN(StateIterator); }; // Specialization for CompactFst. // Never caches, always iterates over the underlying compact elements. template <class A, class C, class U> class ArcIterator< CompactFst<A, C, U> > { public: typedef typename A::StateId StateId; typedef typename C::Element CompactElement; ArcIterator(const CompactFst<A, C, U> &fst, StateId s) : compactor_(fst.GetImpl()->GetCompactor()), state_(s), compacts_(0), pos_(0), flags_(kArcValueFlags) { const CompactFstData<CompactElement, U> *data = fst.GetImpl()->Data(); size_t offset; if (compactor_->Size() == -1) { // Variable out-degree compactor offset = data->States(s); num_arcs_ = data->States(s + 1) - offset; } else { // Fixed out-degree compactor offset = s * compactor_->Size(); num_arcs_ = compactor_->Size(); } if (num_arcs_ > 0) { compacts_ = &(data->Compacts(offset)); arc_ = compactor_->Expand(s, *compacts_, kArcILabelValue); if (arc_.ilabel == kNoStateId) { ++compacts_; --num_arcs_; } } } ~ArcIterator() {} bool Done() const { return pos_ >= num_arcs_; } const A& Value() const { arc_ = compactor_->Expand(state_, compacts_[pos_], flags_); return arc_; } void Next() { ++pos_; } size_t Position() const { return pos_; } void Reset() { pos_ = 0; } void Seek(size_t pos) { pos_ = pos; } uint32 Flags() const { return flags_; } void SetFlags(uint32 f, uint32 m) { flags_ &= ~m; flags_ |= (f & kArcValueFlags); } private: C *compactor_; StateId state_; const CompactElement *compacts_; size_t pos_; size_t num_arcs_; mutable A arc_; uint32 flags_; DISALLOW_COPY_AND_ASSIGN(ArcIterator); }; // // Specialization for CompactFst. // // This is an optionally caching arc iterator. // // TODO(allauzen): implements the kArcValueFlags, the current // // implementation only implements the kArcNoCache flag. // template <class A, class C, class U> // class ArcIterator< CompactFst<A, C, U> > { // public: // typedef typename A::StateId StateId; // ArcIterator(const CompactFst<A, C, U> &fst, StateId s) // : fst_(fst), state_(s), pos_(0), num_arcs_(0), offset_(0), // flags_(kArcValueFlags) { // cache_data_.ref_count = 0; // if (fst_.GetImpl()->HasArcs(state_)) { // fst_.GetImpl()->InitArcIterator(s, &cache_data_); // num_arcs_ = cache_data_.narcs; // return; // } // const C *compactor = fst_.GetImpl()->GetCompactor(); // const CompactFstData<A, C, U> *data = fst_.GetImpl()->Data(); // if (compactor->Size() == -1) { // Variable out-degree compactor // offset_ = data->States(s); // num_arcs_ = data->States(s + 1) - offset_; // } else { // Fixed out-degree compactor // offset_ = s * compactor->Size(); // num_arcs_ = compactor->Size(); // } // if (num_arcs_ > 0) { // const A &arc = fst_.GetImpl()->ComputeArc(s, offset_); // if (arc.ilabel == kNoStateId) { // ++offset_; // --num_arcs_; // } // } // } // ~ArcIterator() { // if (cache_data_.ref_count) // --(*cache_data_.ref_count); // } // bool Done() const { return pos_ >= num_arcs_; } // const A& Value() const { // if (cache_data_.ref_count == 0) { // if (flags_ & kArcNoCache) { // arc_ = fst_.GetImpl()->ComputeArc(state_, pos_ + offset_); // return arc_; // } else { // fst_.GetImpl()->InitArcIterator(state_, &cache_data_); // } // } // return cache_data_.arcs[pos_]; // } // void Next() { ++pos_; } // size_t Position() const { return pos_; } // void Reset() { pos_ = 0; } // void Seek(size_t pos) { pos_ = pos; } // uint32 Flags() const { return flags_; } // void SetFlags(uint32 f, uint32 m) { // flags_ &= ~m; // flags_ |= f; // if (!(flags_ & kArcNoCache) && cache_data_.ref_count == 0) // fst_.GetImpl()->InitArcIterator(state_, &cache_data_); // } // private: // mutable const CompactFst<A, C, U> &fst_; // StateId state_; // size_t pos_; // size_t num_arcs_; // size_t offset_; // uint32 flags_; // mutable A arc_; // mutable ArcIteratorData<A> cache_data_; // DISALLOW_COPY_AND_ASSIGN(ArcIterator); // }; // // Utility Compactors // // Compactor for unweighted string FSTs template <class A> class StringCompactor { public: typedef A Arc; typedef typename A::Label Element; typedef typename A::Label Label; typedef typename A::StateId StateId; typedef typename A::Weight Weight; Element Compact(StateId s, const A &arc) const { return arc.ilabel; } Arc Expand(StateId s, const Element &p, uint32 f = kArcValueFlags) const { return Arc(p, p, Weight::One(), p != kNoLabel ? s + 1 : kNoStateId); } ssize_t Size() const { return 1; } uint64 Properties() const { return kString | kAcceptor | kUnweighted; } bool Compatible(const Fst<A> &fst) const { uint64 props = Properties(); return fst.Properties(props, true) == props; } static const string &Type() { static const string type = "string"; return type; } bool Write(ostream &strm) const { return true; } static StringCompactor *Read(istream &strm) { return new StringCompactor; } }; // Compactor for weighted string FSTs template <class A> class WeightedStringCompactor { public: typedef A Arc; typedef typename A::Label Label; typedef typename A::StateId StateId; typedef typename A::Weight Weight; typedef pair<Label, Weight> Element; Element Compact(StateId s, const A &arc) const { return make_pair(arc.ilabel, arc.weight); } Arc Expand(StateId s, const Element &p, uint32 f = kArcValueFlags) const { return Arc(p.first, p.first, p.second, p.first != kNoLabel ? s + 1 : kNoStateId); } ssize_t Size() const { return 1;} uint64 Properties() const { return kString | kAcceptor; } bool Compatible(const Fst<A> &fst) const { uint64 props = Properties(); return fst.Properties(props, true) == props; } static const string &Type() { static const string type = "weighted_string"; return type; } bool Write(ostream &strm) const { return true; } static WeightedStringCompactor *Read(istream &strm) { return new WeightedStringCompactor; } }; // Compactor for unweighted acceptor FSTs template <class A> class UnweightedAcceptorCompactor { public: typedef A Arc; typedef typename A::Label Label; typedef typename A::StateId StateId; typedef typename A::Weight Weight; typedef pair<Label, StateId> Element; Element Compact(StateId s, const A &arc) const { return make_pair(arc.ilabel, arc.nextstate); } Arc Expand(StateId s, const Element &p, uint32 f = kArcValueFlags) const { return Arc(p.first, p.first, Weight::One(), p.second); } ssize_t Size() const { return -1;} uint64 Properties() const { return kAcceptor | kUnweighted; } bool Compatible(const Fst<A> &fst) const { uint64 props = Properties(); return fst.Properties(props, true) == props; } static const string &Type() { static const string type = "unweighted_acceptor"; return type; } bool Write(ostream &strm) const { return true; } static UnweightedAcceptorCompactor *Read(istream &istrm) { return new UnweightedAcceptorCompactor; } }; // Compactor for weighted acceptor FSTs template <class A> class AcceptorCompactor { public: typedef A Arc; typedef typename A::Label Label; typedef typename A::StateId StateId; typedef typename A::Weight Weight; typedef pair< pair<Label, Weight>, StateId > Element; Element Compact(StateId s, const A &arc) const { return make_pair(make_pair(arc.ilabel, arc.weight), arc.nextstate); } Arc Expand(StateId s, const Element &p, uint32 f = kArcValueFlags) const { return Arc(p.first.first, p.first.first, p.first.second, p.second); } ssize_t Size() const { return -1;} uint64 Properties() const { return kAcceptor; } bool Compatible(const Fst<A> &fst) const { uint64 props = Properties(); return fst.Properties(props, true) == props; } static const string &Type() { static const string type = "acceptor"; return type; } bool Write(ostream &strm) const { return true; } static AcceptorCompactor *Read(istream &strm) { return new AcceptorCompactor; } }; // Compactor for unweighted FSTs template <class A> class UnweightedCompactor { public: typedef A Arc; typedef typename A::Label Label; typedef typename A::StateId StateId; typedef typename A::Weight Weight; typedef pair< pair<Label, Label>, StateId > Element; Element Compact(StateId s, const A &arc) const { return make_pair(make_pair(arc.ilabel, arc.olabel), arc.nextstate); } Arc Expand(StateId s, const Element &p, uint32 f = kArcValueFlags) const { return Arc(p.first.first, p.first.second, Weight::One(), p.second); } ssize_t Size() const { return -1; } uint64 Properties() const { return kUnweighted; } bool Compatible(const Fst<A> &fst) const { uint64 props = Properties(); return fst.Properties(props, true) == props; } static const string &Type() { static const string type = "unweighted"; return type; } bool Write(ostream &strm) const { return true; } static UnweightedCompactor *Read(istream &strm) { return new UnweightedCompactor; } }; // Uselful aliases when using StdArc typedef CompactFst< StdArc, StringCompactor<StdArc> > StdCompactStringFst; typedef CompactFst< StdArc, WeightedStringCompactor<StdArc> > StdCompactWeightedStringFst; typedef CompactFst<StdArc, AcceptorCompactor<StdArc> > StdCompactAcceptorFst; typedef CompactFst<StdArc, UnweightedCompactor<StdArc> > StdCompactUnweightedFst; typedef CompactFst<StdArc, UnweightedAcceptorCompactor<StdArc> > StdCompactUnweightedAcceptorFst; } // namespace fst #endif // FST_LIB_COMPACT_FST_H__