From f2b67244d84c5cc2e2391068669ecd175005551c Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Fri, 6 Nov 2020 15:54:31 +0100 Subject: [PATCH 1/6] Tool for lattice rescoring by composing with per-utterance FSTs. --- src/latbin/Makefile | 5 +- src/latbin/lattice-compose-fsts.cc | 190 +++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 2 deletions(-) create mode 100644 src/latbin/lattice-compose-fsts.cc diff --git a/src/latbin/Makefile b/src/latbin/Makefile index d5cc4d035b9..592fca41e50 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -26,7 +26,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned lattice-reverse \ - lattice-expand lattice-path-cover lattice-add-nnlmscore + lattice-expand lattice-path-cover lattice-add-nnlmscore \ + lattice-compose-fsts OBJFILES = @@ -36,6 +37,6 @@ TESTFILES = ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \ ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc new file mode 100644 index 00000000000..1a74094e76e --- /dev/null +++ b/src/latbin/lattice-compose-fsts.cc @@ -0,0 +1,190 @@ +// latbin/lattice-compose-fsts.cc + +// Copyright 2020 Brno University of Technology; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
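+
+// Illustration (not produced or required by this tool): a per-utterance
+// word-network FST for fst-rspecifier2 can be written in OpenFst text
+// format, with word labels on both sides and any rescoring cost as the
+// arc weight, e.g.
+//
+//   0 1 HELLO HELLO -2.0
+//   1 2 WORLD WORLD 0.0
+//   2
+//
+// compiled with fstcompile --isymbols=words.txt --osymbols=words.txt and
+// arc-sorted with fstarcsort --sort_type=ilabel; the words and costs above
+// are made-up values.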
+ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Composes lattices (in transducer form, as type Lattice) with word-network FSTs.\n" + "Either with a single FST from rxfilename or with per-utterance FSTs from rspecifier.\n" + "The FST weights are interpreted as \"graph weights\" when converted into the Lattice format.\n" + "\n" + "Usage: lattice-compose-fsts [options] lattice-rspecifier1 " + "(fst-rspecifier2|fst-rxfilename2) lattice-wspecifier\n" + " e.g.: lattice-compose-fsts ark:1.lats ark:2.fsts ark:composed.lats\n" + " or: lattice-compose-fsts ark:1.lats G.fst ark:composed.lats\n"; + + ParseOptions po(usage); + + bool write_compact = true; + int32 num_states_cache = 50000; + int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); + po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); + po.Register("num-states-cache", &num_states_cache, + "Number of states we cache when mapping LM FST to lattice type. " + "More -> more memory but faster."); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + KALDI_ASSERT(phi_label > 0 || phi_label == fst::kNoLabel); // e.g. 0 not allowed. + + std::string lats_rspecifier1 = po.GetArg(1), + arg2 = po.GetArg(2), + lats_wspecifier = po.GetArg(3); + int32 n_done = 0, n_fail = 0; + + SequentialLatticeReader lattice_reader1(lats_rspecifier1); + + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); + + if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { + std::string fst_rxfilename = arg2; + VectorFst* fst2 = fst::ReadFstKaldi(fst_rxfilename); + // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring, + // with all the cost on the first member of the pair (since we're + // assuming it's a graph weight). + if (fst2->Properties(fst::kILabelSorted, true) == 0) { + // Make sure fst2 is sorted on ilabel. 
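+      // (OpenFst composition requires the left FST to be olabel-sorted or
+      // the right FST to be ilabel-sorted; lat1 is additionally sorted on
+      // olabel below.)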
+ fst::ILabelCompare ilabel_comp; + ArcSort(fst2, ilabel_comp); + } + if (phi_label > 0) + PropagateFinal(phi_label, fst2); + + fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); + fst::StdToLatticeMapper mapper; + fst::MapFst > + mapped_fst2(*fst2, mapper, mapfst_opts); + + for (; !lattice_reader1.Done(); lattice_reader1.Next()) { + std::string key = lattice_reader1.Key(); + KALDI_VLOG(1) << "Processing lattice for key " << key; + Lattice lat1 = lattice_reader1.Value(); + ArcSort(&lat1, fst::OLabelCompare()); + Lattice composed_lat; + if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); + else Compose(lat1, mapped_fst2, &composed_lat); + if (composed_lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; + n_fail++; + } else { + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + lattice_writer.Write(key, composed_lat); + } + n_done++; + } + } + delete fst2; + } else { + // composing with each utterance with different fst, + std::string fst_rspecifier2 = arg2; + RandomAccessTableReader fst_reader2(fst_rspecifier2); + + for (; !lattice_reader1.Done(); lattice_reader1.Next()) { + std::string key = lattice_reader1.Key(); + KALDI_VLOG(1) << "Processing lattice for key " << key; + Lattice lat1 = lattice_reader1.Value(); + lattice_reader1.FreeCurrent(); + + if (!fst_reader2.HasKey(key)) { + KALDI_WARN << "Not producing output for utterance " << key + << " because not present in second table."; + n_fail++; + continue; + } + + VectorFst fst2 = fst_reader2.Value(key); + if (fst2.Properties(fst::kILabelSorted, true) == 0) { + // Make sure fst2 is sorted on ilabel. + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&fst2, ilabel_comp); + } + if (phi_label > 0) + PropagateFinal(phi_label, &fst2); + + // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring, + // with all the cost on the first member of the pair (since we're + // assuming it's a graph weight). + fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); + fst::StdToLatticeMapper mapper; + fst::MapFst > + mapped_fst2(fst2, mapper, mapfst_opts); + + // sort lat1 on olabel. + ArcSort(&lat1, fst::OLabelCompare()); + + Lattice composed_lat; + if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); + else Compose(lat1, mapped_fst2, &composed_lat); + + if (composed_lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; + n_fail++; + } else { + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + lattice_writer.Write(key, composed_lat); + } + n_done++; + } + } + } + + KALDI_LOG << "Done " << n_done << " lattices; failed for " + << n_fail; + + return (n_done != 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From ecb0ce76627f970a1745edd48a4dfe1cd7ab37df Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Thu, 17 Dec 2020 19:43:57 +0100 Subject: [PATCH 2/6] nnet3-latgen-faster-compose, online composition of HCLG graph with boosting graph --- .gitignore | 3 + src/latbin/lattice-compose-fsts.cc | 4 + src/nnet3bin/Makefile | 5 +- src/nnet3bin/nnet3-latgen-faster-compose.cc | 273 ++++++++++++++++++++ 4 files changed, 283 insertions(+), 2 deletions(-) create mode 100644 src/nnet3bin/nnet3-latgen-faster-compose.cc diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..d0a03a5c13e 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ core .[#]* *~ +# vim autosave and backup files. +*.sw? + # [ecg]tag files. TAGS tags diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc index 1a74094e76e..bbdcc670f01 100644 --- a/src/latbin/lattice-compose-fsts.cc +++ b/src/latbin/lattice-compose-fsts.cc @@ -88,8 +88,10 @@ int main(int argc, char *argv[]) { fst::ILabelCompare ilabel_comp; ArcSort(fst2, ilabel_comp); } + /* // THIS MAKES ALL STATES FINAL STATES! WHY? if (phi_label > 0) PropagateFinal(phi_label, fst2); + */ fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); @@ -144,8 +146,10 @@ int main(int argc, char *argv[]) { fst::ILabelCompare ilabel_comp; fst::ArcSort(&fst2, ilabel_comp); } + /* // THIS MAKES ALL STATES FINAL STATES! WHY? if (phi_label > 0) PropagateFinal(phi_label, &fst2); + */ // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring, // with all the cost on the first member of the pair (since we're diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2804e4b31fe 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -22,7 +22,8 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-xvector-compute-batched \ nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \ nnet3-latgen-faster-lookahead cuda-gpu-available cuda-compiled \ - nnet3-latgen-faster-looped-parallel + nnet3-latgen-faster-looped-parallel \ + nnet3-latgen-faster-compose OBJFILES = @@ -37,7 +38,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../ivector/kaldi-ivector.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc new file mode 100644 index 00000000000..ad9f8d38d21 --- /dev/null +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -0,0 +1,273 @@ +// nnet3bin/nnet3-latgen-faster-compose.cc + +// Copyright 2020 Brno University of Technology (author: Karel Vesely) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Guoguo Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" +#include "base/timer.h" + +#include +#include + + +int main(int argc, char *argv[]) { + // note: making this program work with GPUs is as simple as initializing the + // device, but it probably won't make a huge difference in speed for typical + // setups. You should use nnet3-latgen-faster-batch if you want to use a GPU. + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::Fst; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Generate lattices using nnet3 neural net model, with on-the-fly composition HCLG o B.\n" + "B is utterance-specific boosting graph, typically a single-state FST with\n" + "all words from words.txt on self loop arcs (then composition is not prohibitevly slow).\n" + "Some word-arcs will have score discounts as costs, to boost them in HMM beam-search.\n" + "Or, by not including words in B, we can remove them from HCLG network.\n" + "Usage: nnet3-latgen-faster-compose [options] " + " [ [] ]\n" + "See also: nnet3-latgen-faster-parallel, nnet3-latgen-faster-batch\n"; + + ParseOptions po(usage); + + Timer timer, timer_compose; + double elapsed_compose = 0.0; + + bool allow_partial = false; + LatticeFasterDecoderConfig config; + NnetSimpleComputationOptions decodable_opts; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + config.Register(&po); + decodable_opts.Register(&po); + po.Register("word-symbol-table", &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, + "If true, produce output even if end state was not reached."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for " + "utt2spk option used to get ivectors per speaker"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. 
If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + + po.Read(argc, argv); + + if (po.NumArgs() < 4 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + hclg_fst_rxfilename = po.GetArg(2), + boosting_fst_rspecifier = po.GetArg(3), + feature_rspecifier = po.GetArg(4), + lattice_wspecifier = po.GetArg(5), + words_wspecifier = po.GetOptArg(6), + alignment_wspecifier = po.GetOptArg(7); + + TransitionModel trans_model; + AmNnetSimple am_nnet; + { + bool binary; + Input ki(model_in_filename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + CollapseModel(CollapseModelConfig(), &(am_nnet.GetNnet())); + } + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + Int32VectorWriter words_writer(words_wspecifier); + Int32VectorWriter alignment_writer(alignment_wspecifier); + + std::unique_ptr word_syms = nullptr; + if (word_syms_filename != "") { + word_syms.reset(fst::SymbolTable::ReadText(word_syms_filename)); + if (!word_syms) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + } + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + // this compiler object allows caching of computations across + // different utterances. + CachingOptimizingCompiler compiler(am_nnet.GetNnet(), + decodable_opts.optimize_config); + + KALDI_ASSERT(ClassifyRspecifier(hclg_fst_rxfilename, NULL, NULL) == kNoRspecifier); + { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + RandomAccessTableReader boosting_fst_reader(boosting_fst_rspecifier); + + // HCLG FST is just one FST, not a table of FSTs. 
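+      // It is read once and kept in memory for the whole run; only the
+      // boosting graphs are fetched per utterance from the rspecifier table.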
+ auto hclg_fst = std::unique_ptr>(fst::ReadFstKaldi(hclg_fst_rxfilename)); + + // make sure hclg is sorted on olabel + if (hclg_fst->Properties(fst::kOLabelSorted, true) == 0) { + fst::OLabelCompare olabel_comp; + fst::ArcSort(hclg_fst.get(), olabel_comp); + } + + timer.Reset(); + + { + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + // get the boosting graph, + VectorFst boosting_fst; + if (!boosting_fst_reader.HasKey(utt)) { + KALDI_WARN << "No boosting fst for utterance " << utt; + num_fail++; + continue; + } else { + boosting_fst = boosting_fst_reader.Value(utt); // copy, + } + + timer_compose.Reset(); + + // make sure boosting graph is sorted on ilabel, + if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&boosting_fst, ilabel_comp); + } + + // TODO: should we call rmepsilon on boosting_fst ? + + // run composition (measure time), + VectorFst decode_fst; + fst::Compose(*hclg_fst, boosting_fst, &decode_fst); + + // TODO: should we sort the 'decode_fst' by isymbols ? + // (we don't do it, as it would take time. 
+ // not sure it decoding would be faster if + // decode_fst was sorted by isymbols) + + elapsed_compose += timer_compose.Elapsed(); + + DecodableAmNnetSimple nnet_decodable( + decodable_opts, trans_model, am_nnet, + features, ivector, online_ivectors, + online_ivector_period, &compiler); + + LatticeFasterDecoder decoder(decode_fst, config); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms.get(), utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += nnet_decodable.NumFramesReady(); + num_success++; + } else num_fail++; + } + } + } + + kaldi::int64 input_frame_count = + frame_count * decodable_opts.frame_subsampling_factor; + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed * 100.0 / input_frame_count); + KALDI_LOG << "Composition time "<< elapsed_compose + << "s (" << (elapsed_compose * 100.0 / elapsed) << "%)"; + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " + << (tot_like / frame_count) << " over " + << frame_count << " frames."; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From d87ec232edb0faccdb437108e7f984affc750484 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 4 Jan 2021 16:21:49 +0100 Subject: [PATCH 3/6] adding script for on-the-fly boosting of HCLG graph --- egs/wsj/s5/steps/nnet3/decode_compose.sh | 181 ++++++++++++++++++++ src/nnet3bin/nnet3-latgen-faster-compose.cc | 8 + 2 files changed, 189 insertions(+) create mode 100755 egs/wsj/s5/steps/nnet3/decode_compose.sh diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh new file mode 100755 index 00000000000..9f4e8bef020 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. 
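+#
+# Note: 'boosting_graphs' must be set to an rspecifier holding one boosting
+# FST per utterance key, e.g. "ark:exp/boost/utt_graphs.fsts" (a hypothetical
+# path). Each graph is typically a single-state FST with all words from
+# words.txt on self-loop arcs, where boosted words carry a negative
+# (discount) cost; see the usage message of nnet3-latgen-faster-compose.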
+ +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index ad9f8d38d21..e560a8a15af 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -225,6 +225,14 @@ int main(int argc, char *argv[]) { // not sure it decoding would be faster if // decode_fst was sorted by isymbols) + // Check that composed graph is non-empty, + if (decode_fst.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " + << utt << " (bad boosting graph?)"; + num_fail++; + continue; + } + elapsed_compose += timer_compose.Elapsed(); DecodableAmNnetSimple nnet_decodable( From f9ae938499190fc6f9e3ea0f454bce455969b0d1 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Tue, 5 Jan 2021 19:00:37 +0100 Subject: [PATCH 4/6] updating the HCLG boosting code, debugging --- egs/wsj/s5/steps/nnet3/decode_compose.sh | 4 +- src/nnet3bin/nnet3-latgen-faster-compose.cc | 172 +++++++++++--------- 2 files changed, 95 insertions(+), 81 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh index 9f4e8bef020..8d003b60eac 100755 --- a/egs/wsj/s5/steps/nnet3/decode_compose.sh +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -22,8 +22,8 @@ min_active=200 ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final -num_threads=1 # if >1, will use gmm-latgen-faster-parallel -use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. 
# In that case it is recommended to set num-threads to a large # number, e.g. 20 if you have that many free CPU slots on a GPU # node, and to use a small number of jobs. diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index e560a8a15af..ad230540584 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -31,6 +31,7 @@ #include "base/timer.h" #include +#include #include @@ -154,106 +155,119 @@ int main(int argc, char *argv[]) { RandomAccessTableReader boosting_fst_reader(boosting_fst_rspecifier); - // HCLG FST is just one FST, not a table of FSTs. - auto hclg_fst = std::unique_ptr>(fst::ReadFstKaldi(hclg_fst_rxfilename)); + // 'hclg_fst' is a single FST. + VectorFst hclg_fst; + { + auto hclg_fst_tmp = std::unique_ptr>(fst::ReadFstKaldiGeneric(hclg_fst_rxfilename)); + hclg_fst = VectorFst(*hclg_fst_tmp); // Fst -> VectorFst, as it has to be MutableFst... + // 'hclg_fst_tmp' is deleted by 'going out of scope' ... + } // make sure hclg is sorted on olabel - if (hclg_fst->Properties(fst::kOLabelSorted, true) == 0) { + if (hclg_fst.Properties(fst::kOLabelSorted, true) == 0) { fst::OLabelCompare olabel_comp; - fst::ArcSort(hclg_fst.get(), olabel_comp); + fst::ArcSort(&hclg_fst, olabel_comp); } timer.Reset(); - { - - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - const Matrix &features (feature_reader.Value()); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; + //// MAIN LOOP //// + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; num_fail++; continue; + } else { + ivector = &ivector_reader.Value(utt); } - const Matrix *online_ivectors = NULL; - const Vector *ivector = NULL; - if (!ivector_rspecifier.empty()) { - if (!ivector_reader.HasKey(utt)) { - KALDI_WARN << "No iVector available for utterance " << utt; - num_fail++; - continue; - } else { - ivector = &ivector_reader.Value(utt); - } - } - if (!online_ivector_rspecifier.empty()) { - if (!online_ivector_reader.HasKey(utt)) { - KALDI_WARN << "No online iVector available for utterance " << utt; - num_fail++; - continue; - } else { - online_ivectors = &online_ivector_reader.Value(utt); - } - } - - // get the boosting graph, - VectorFst boosting_fst; - if (!boosting_fst_reader.HasKey(utt)) { - KALDI_WARN << "No boosting fst for utterance " << utt; + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; num_fail++; continue; } else { - boosting_fst = boosting_fst_reader.Value(utt); // copy, + online_ivectors = &online_ivector_reader.Value(utt); } + } - timer_compose.Reset(); - - // make sure boosting graph is sorted on ilabel, - if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { - fst::ILabelCompare ilabel_comp; - fst::ArcSort(&boosting_fst, ilabel_comp); - } + // get the boosting graph, + VectorFst boosting_fst; + if (!boosting_fst_reader.HasKey(utt)) { + KALDI_WARN << "No boosting fst for utterance " << utt; + num_fail++; + 
continue; + } else { + boosting_fst = boosting_fst_reader.Value(utt); // copy, + } - // TODO: should we call rmepsilon on boosting_fst ? + timer_compose.Reset(); - // run composition (measure time), - VectorFst decode_fst; - fst::Compose(*hclg_fst, boosting_fst, &decode_fst); + // RmEpsilon saved 30% of composition runtime... + // - Note: we are loading 2-state graphs with eps back-link to the initial state. + if (boosting_fst.Properties(fst::kIEpsilons, true) != 0) { + fst::RmEpsilon(&boosting_fst); + } - // TODO: should we sort the 'decode_fst' by isymbols ? - // (we don't do it, as it would take time. - // not sure it decoding would be faster if - // decode_fst was sorted by isymbols) + // make sure boosting graph is sorted on ilabel, + if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&boosting_fst, ilabel_comp); + } - // Check that composed graph is non-empty, - if (decode_fst.Start() == fst::kNoStateId) { - KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " - << utt << " (bad boosting graph?)"; - num_fail++; - continue; - } + // run composition, + VectorFst decode_fst; + fst::Compose(hclg_fst, boosting_fst, &decode_fst); - elapsed_compose += timer_compose.Elapsed(); - - DecodableAmNnetSimple nnet_decodable( - decodable_opts, trans_model, am_nnet, - features, ivector, online_ivectors, - online_ivector_period, &compiler); - - LatticeFasterDecoder decoder(decode_fst, config); - - double like; - if (DecodeUtteranceLatticeFaster( - decoder, nnet_decodable, trans_model, word_syms.get(), utt, - decodable_opts.acoustic_scale, determinize, allow_partial, - &alignment_writer, &words_writer, &compact_lattice_writer, - &lattice_writer, - &like)) { - tot_like += like; - frame_count += nnet_decodable.NumFramesReady(); - num_success++; - } else num_fail++; + // check that composed graph is non-empty, + if (decode_fst.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " + << utt << " (bad boosting graph?)"; + num_fail++; + continue; } + + elapsed_compose += timer_compose.Elapsed(); + + DecodableAmNnetSimple nnet_decodable( + decodable_opts, trans_model, am_nnet, + features, ivector, online_ivectors, + online_ivector_period, &compiler); + + // Note: decode_fst is VectorFst, not ConstFst. + // + // OpenFst docs say that more specific iterators + // are faster than generic iterators. And in HCLG + // is usually loaded for decoding as ConstFst. + // + // auto decode_fst_ = ConstFst(decode_fst); + // + // In this way, I tried to cast VectorFst to ConstFst, + // but this made the decoding 20% slower. + // + LatticeFasterDecoder decoder(decode_fst, config); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms.get(), utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += nnet_decodable.NumFramesReady(); + num_success++; + } else num_fail++; } } From 0e31ffa77782a8f704bf162f43a917955b13d26c Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 21 Jun 2021 16:20:55 +0200 Subject: [PATCH 5/6] Apply suggestions from code review Adding suggestions from code review. 
Co-authored-by: Cy 'kkm' Katsnelson --- egs/wsj/s5/steps/nnet3/decode_compose.sh | 4 +-- src/latbin/lattice-compose-fsts.cc | 28 ++++++++++------- src/nnet3bin/nnet3-latgen-faster-compose.cc | 35 +++++++++++---------- 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh index 8d003b60eac..5e3b80adf09 100755 --- a/egs/wsj/s5/steps/nnet3/decode_compose.sh +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -48,7 +48,7 @@ echo "$0 $@" # Print the command line for logging if [ $# -ne 3 ]; then echo "Usage: $0 [options] " echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" - echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" echo "main options (for others, see top of script file)" echo " --config # config containing options" @@ -67,7 +67,7 @@ fi graphdir=$1 data=$2 dir=$3 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. model=$srcdir/$iter.mdl [ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc index bbdcc670f01..8fd889e192f 100644 --- a/src/latbin/lattice-compose-fsts.cc +++ b/src/latbin/lattice-compose-fsts.cc @@ -47,12 +47,14 @@ int main(int argc, char *argv[]) { bool write_compact = true; int32 num_states_cache = 50000; - int32 phi_label = fst::kNoLabel; // == -1 - po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); + int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form."); + po.Register("phi-label", &phi_label, + "If >0, the label on backoff arcs of the LM"); po.Register("num-states-cache", &num_states_cache, - "Number of states we cache when mapping LM FST to lattice type. " - "More -> more memory but faster."); + "Number of states we cache when mapping LM FST to lattice type." + " More -> more memory but faster."); po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -72,10 +74,11 @@ int main(int argc, char *argv[]) { CompactLatticeWriter compact_lattice_writer; LatticeWriter lattice_writer; - if (write_compact) + if (write_compact) { compact_lattice_writer.Open(lats_wspecifier); - else + } else { lattice_writer.Open(lats_wspecifier); + } if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { std::string fst_rxfilename = arg2; @@ -105,8 +108,11 @@ int main(int argc, char *argv[]) { Lattice lat1 = lattice_reader1.Value(); ArcSort(&lat1, fst::OLabelCompare()); Lattice composed_lat; - if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); - else Compose(lat1, mapped_fst2, &composed_lat); + if (phi_label > 0) { + PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); + } else { + Compose(lat1, mapped_fst2, &composed_lat); + } if (composed_lat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; @@ -123,7 +129,7 @@ int main(int argc, char *argv[]) { } delete fst2; } else { - // composing with each utterance with different fst, + // Compose each utterance with its matching (by key) FST. 
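+      // (Utterances with no matching key in the FST table are skipped with
+      // a warning and counted in n_fail.)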
std::string fst_rspecifier2 = arg2; RandomAccessTableReader fst_reader2(fst_rspecifier2); @@ -135,7 +141,7 @@ int main(int argc, char *argv[]) { if (!fst_reader2.HasKey(key)) { KALDI_WARN << "Not producing output for utterance " << key - << " because not present in second table."; + << " because it's not present in second table."; n_fail++; continue; } diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index ad230540584..7d9a5081dc5 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -20,19 +20,20 @@ // limitations under the License. +#include +#include + +#include + #include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "tree/context-dep.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" +#include "base/timer.h" #include "decoder/decoder-wrappers.h" +#include "fstext/fstext-lib.h" +#include "hmm/transition-model.h" #include "nnet3/nnet-am-decodable-simple.h" #include "nnet3/nnet-utils.h" -#include "base/timer.h" - -#include -#include -#include +#include "tree/context-dep.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { @@ -121,9 +122,10 @@ int main(int argc, char *argv[]) { CompactLatticeWriter compact_lattice_writer; LatticeWriter lattice_writer; if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) - : lattice_writer.Open(lattice_wspecifier))) + : lattice_writer.Open(lattice_wspecifier))) { KALDI_ERR << "Could not open table for writing lattices: " - << lattice_wspecifier; + << lattice_wspecifier; + } RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); @@ -213,13 +215,13 @@ int main(int argc, char *argv[]) { timer_compose.Reset(); - // RmEpsilon saved 30% of composition runtime... + // RmEpsilon saved 30% of composition runtime. // - Note: we are loading 2-state graphs with eps back-link to the initial state. if (boosting_fst.Properties(fst::kIEpsilons, true) != 0) { fst::RmEpsilon(&boosting_fst); } - // make sure boosting graph is sorted on ilabel, + // Make sure boosting graph is sorted on ilabel. if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { fst::ILabelCompare ilabel_comp; fst::ArcSort(&boosting_fst, ilabel_comp); @@ -267,7 +269,9 @@ int main(int argc, char *argv[]) { tot_like += like; frame_count += nnet_decodable.NumFramesReady(); num_success++; - } else num_fail++; + } else { + ++num_fail; + } } } @@ -286,8 +290,7 @@ int main(int argc, char *argv[]) { << (tot_like / frame_count) << " over " << frame_count << " frames."; - if (num_success != 0) return 0; - else return 1; + return num_success != 0 ? 
0 : 1; } catch(const std::exception &e) { std::cerr << e.what(); return -1; From 634a7159aa7e8e5e0815f1c34e1b1954b9b5aabd Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 31 Jan 2022 18:22:04 +0100 Subject: [PATCH 6/6] nnet3-latgen-faster-compose, add RhoCompose and nnet3/decode_compose_rho.sh script --- egs/wsj/s5/steps/nnet3/decode_compose_rho.sh | 184 +++++++++++++++++++ src/configure_mkl | 21 +++ src/nnet3bin/nnet3-latgen-faster-compose.cc | 14 +- 3 files changed, 217 insertions(+), 2 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/decode_compose_rho.sh create mode 100755 src/configure_mkl diff --git a/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh new file mode 100755 index 00000000000..362bec260ca --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! 
-z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +rho_label=$(grep '#0' $graphdir/words.txt | awk '{ print $2; }') + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --rho-label=$rho_label \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! 
-z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/src/configure_mkl b/src/configure_mkl new file mode 100755 index 00000000000..0f2936471f7 --- /dev/null +++ b/src/configure_mkl @@ -0,0 +1,21 @@ +#!/bin/bash + +# Get the MKL config, +#source /usr/local/share/intel/mkl/bin/mklvars.sh intel64 ilp64 # outdated... +export MKLROOT=/usr/local/share/intel/mkl/2021.4.0 + +# Use older compiler +# export CXX=g++-7.4 # CUDA 10.0 works well with version '7.4', +export CXX=g++-9.4 # CUDA 11.2, recommended gcc version 9.*, + +export CXXFLAGS="-march=x86-64" # compile for 'generic' 64bit CPU, +#export CXXFLAGS="-march=westmere" # oldest architecutre we have at BUT (X5675, Westmere, blade024), + +# Use different CUDA, +# CUDATK=/usr/local/share/cuda-10.2.89 # CUDA 10.0 supports our default gcc 7.4.0, +# CUDATK=/usr/local/share/cuda-11.0.194 +CUDATK=/usr/local/share/cuda-11.2 +# and add '--cudatk-dir=$CUDATK' to './configure' + +# Generate kaldi.mk, +./configure --mkl-root=$MKLROOT --cudatk-dir=$CUDATK --shared diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index 7d9a5081dc5..d4e7c094d30 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -64,6 +64,8 @@ int main(int argc, char *argv[]) { Timer timer, timer_compose; double elapsed_compose = 0.0; + int32 rho_label = fst::kNoLabel; // == -1 + bool allow_partial = false; LatticeFasterDecoderConfig config; NnetSimpleComputationOptions decodable_opts; @@ -75,6 +77,10 @@ int main(int argc, char *argv[]) { int32 online_ivector_period = 0; config.Register(&po); decodable_opts.Register(&po); + + po.Register("rho-label", &rho_label, + "If >0, symbol for 'match the rest' in the biasing graph boosting_fst"); + po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]"); po.Register("allow-partial", &allow_partial, @@ -125,7 +131,7 @@ int main(int argc, char *argv[]) { : lattice_writer.Open(lattice_wspecifier))) { KALDI_ERR << "Could not open table for writing lattices: " << lattice_wspecifier; - } + } RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); @@ -229,7 +235,11 @@ int main(int argc, char *argv[]) { // run composition, VectorFst decode_fst; - fst::Compose(hclg_fst, boosting_fst, &decode_fst); + if (rho_label > 0) { + fst::RhoCompose(hclg_fst, boosting_fst, rho_label, &decode_fst); + } else { + fst::Compose(hclg_fst, boosting_fst, &decode_fst); + } // check that composed graph is non-empty, if (decode_fst.Start() == fst::kNoStateId) {