From f2b67244d84c5cc2e2391068669ecd175005551c Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Fri, 6 Nov 2020 15:54:31 +0100 Subject: [PATCH 1/6] Tool for lattice rescoring by composing with per-utterance FSTs. --- src/latbin/Makefile | 5 +- src/latbin/lattice-compose-fsts.cc | 190 +++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 2 deletions(-) create mode 100644 src/latbin/lattice-compose-fsts.cc diff --git a/src/latbin/Makefile b/src/latbin/Makefile index d5cc4d035b9..592fca41e50 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -26,7 +26,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned lattice-reverse \ - lattice-expand lattice-path-cover lattice-add-nnlmscore + lattice-expand lattice-path-cover lattice-add-nnlmscore \ + lattice-compose-fsts OBJFILES = @@ -36,6 +37,6 @@ TESTFILES = ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \ ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc new file mode 100644 index 00000000000..1a74094e76e --- /dev/null +++ b/src/latbin/lattice-compose-fsts.cc @@ -0,0 +1,190 @@ +// latbin/lattice-compose-fsts.cc + +// Copyright 2020 Brno University of Technology; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
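+
+// Illustration (not produced or required by this tool): a per-utterance
+// word-network FST for fst-rspecifier2 can be written in OpenFst text
+// format, with word labels on both sides and any rescoring cost as the
+// arc weight, e.g.
+//
+//   0 1 HELLO HELLO -2.0
+//   1 2 WORLD WORLD 0.0
+//   2
+//
+// compiled with fstcompile --isymbols=words.txt --osymbols=words.txt and
+// arc-sorted with fstarcsort --sort_type=ilabel; the words and costs above
+// are made-up values.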
+ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Composes lattices (in transducer form, as type Lattice) with word-network FSTs.\n" + "Either with a single FST from rxfilename or with per-utterance FSTs from rspecifier.\n" + "The FST weights are interpreted as \"graph weights\" when converted into the Lattice format.\n" + "\n" + "Usage: lattice-compose-fsts [options] lattice-rspecifier1 " + "(fst-rspecifier2|fst-rxfilename2) lattice-wspecifier\n" + " e.g.: lattice-compose-fsts ark:1.lats ark:2.fsts ark:composed.lats\n" + " or: lattice-compose-fsts ark:1.lats G.fst ark:composed.lats\n"; + + ParseOptions po(usage); + + bool write_compact = true; + int32 num_states_cache = 50000; + int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); + po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); + po.Register("num-states-cache", &num_states_cache, + "Number of states we cache when mapping LM FST to lattice type. " + "More -> more memory but faster."); + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + KALDI_ASSERT(phi_label > 0 || phi_label == fst::kNoLabel); // e.g. 0 not allowed. + + std::string lats_rspecifier1 = po.GetArg(1), + arg2 = po.GetArg(2), + lats_wspecifier = po.GetArg(3); + int32 n_done = 0, n_fail = 0; + + SequentialLatticeReader lattice_reader1(lats_rspecifier1); + + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); + + if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { + std::string fst_rxfilename = arg2; + VectorFst* fst2 = fst::ReadFstKaldi(fst_rxfilename); + // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring, + // with all the cost on the first member of the pair (since we're + // assuming it's a graph weight). + if (fst2->Properties(fst::kILabelSorted, true) == 0) { + // Make sure fst2 is sorted on ilabel. 
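+      // (OpenFst composition requires the left FST to be olabel-sorted or
+      // the right FST to be ilabel-sorted; lat1 is additionally sorted on
+      // olabel below.)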
+ fst::ILabelCompare ilabel_comp; + ArcSort(fst2, ilabel_comp); + } + if (phi_label > 0) + PropagateFinal(phi_label, fst2); + + fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); + fst::StdToLatticeMapper mapper; + fst::MapFst > + mapped_fst2(*fst2, mapper, mapfst_opts); + + for (; !lattice_reader1.Done(); lattice_reader1.Next()) { + std::string key = lattice_reader1.Key(); + KALDI_VLOG(1) << "Processing lattice for key " << key; + Lattice lat1 = lattice_reader1.Value(); + ArcSort(&lat1, fst::OLabelCompare()); + Lattice composed_lat; + if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); + else Compose(lat1, mapped_fst2, &composed_lat); + if (composed_lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; + n_fail++; + } else { + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + lattice_writer.Write(key, composed_lat); + } + n_done++; + } + } + delete fst2; + } else { + // composing with each utterance with different fst, + std::string fst_rspecifier2 = arg2; + RandomAccessTableReader fst_reader2(fst_rspecifier2); + + for (; !lattice_reader1.Done(); lattice_reader1.Next()) { + std::string key = lattice_reader1.Key(); + KALDI_VLOG(1) << "Processing lattice for key " << key; + Lattice lat1 = lattice_reader1.Value(); + lattice_reader1.FreeCurrent(); + + if (!fst_reader2.HasKey(key)) { + KALDI_WARN << "Not producing output for utterance " << key + << " because not present in second table."; + n_fail++; + continue; + } + + VectorFst fst2 = fst_reader2.Value(key); + if (fst2.Properties(fst::kILabelSorted, true) == 0) { + // Make sure fst2 is sorted on ilabel. + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&fst2, ilabel_comp); + } + if (phi_label > 0) + PropagateFinal(phi_label, &fst2); + + // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring, + // with all the cost on the first member of the pair (since we're + // assuming it's a graph weight). + fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); + fst::StdToLatticeMapper mapper; + fst::MapFst > + mapped_fst2(fst2, mapper, mapfst_opts); + + // sort lat1 on olabel. + ArcSort(&lat1, fst::OLabelCompare()); + + Lattice composed_lat; + if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); + else Compose(lat1, mapped_fst2, &composed_lat); + + if (composed_lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; + n_fail++; + } else { + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + lattice_writer.Write(key, composed_lat); + } + n_done++; + } + } + } + + KALDI_LOG << "Done " << n_done << " lattices; failed for " + << n_fail; + + return (n_done != 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From ecb0ce76627f970a1745edd48a4dfe1cd7ab37df Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Thu, 17 Dec 2020 19:43:57 +0100 Subject: [PATCH 2/6] nnet3-latgen-faster-compose, online composition of HCLG graph with boosting graph --- .gitignore | 3 + src/latbin/lattice-compose-fsts.cc | 4 + src/nnet3bin/Makefile | 5 +- src/nnet3bin/nnet3-latgen-faster-compose.cc | 273 ++++++++++++++++++++ 4 files changed, 283 insertions(+), 2 deletions(-) create mode 100644 src/nnet3bin/nnet3-latgen-faster-compose.cc diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..d0a03a5c13e 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ core .[#]* *~ +# vim autosave and backup files. +*.sw? + # [ecg]tag files. TAGS tags diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc index 1a74094e76e..bbdcc670f01 100644 --- a/src/latbin/lattice-compose-fsts.cc +++ b/src/latbin/lattice-compose-fsts.cc @@ -88,8 +88,10 @@ int main(int argc, char *argv[]) { fst::ILabelCompare ilabel_comp; ArcSort(fst2, ilabel_comp); } + /* // THIS MAKES ALL STATES FINAL STATES! WHY? if (phi_label > 0) PropagateFinal(phi_label, fst2); + */ fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); @@ -144,8 +146,10 @@ int main(int argc, char *argv[]) { fst::ILabelCompare ilabel_comp; fst::ArcSort(&fst2, ilabel_comp); } + /* // THIS MAKES ALL STATES FINAL STATES! WHY? if (phi_label > 0) PropagateFinal(phi_label, &fst2); + */ // mapped_fst2 is fst2 interpreted using the LatticeWeight semiring, // with all the cost on the first member of the pair (since we're diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2804e4b31fe 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -22,7 +22,8 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-xvector-compute-batched \ nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \ nnet3-latgen-faster-lookahead cuda-gpu-available cuda-compiled \ - nnet3-latgen-faster-looped-parallel + nnet3-latgen-faster-looped-parallel \ + nnet3-latgen-faster-compose OBJFILES = @@ -37,7 +38,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../ivector/kaldi-ivector.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc new file mode 100644 index 00000000000..ad9f8d38d21 --- /dev/null +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -0,0 +1,273 @@ +// nnet3bin/nnet3-latgen-faster-compose.cc + +// Copyright 2020 Brno University of Technology (author: Karel Vesely) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Guoguo Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/nnet-utils.h" +#include "base/timer.h" + +#include +#include + + +int main(int argc, char *argv[]) { + // note: making this program work with GPUs is as simple as initializing the + // device, but it probably won't make a huge difference in speed for typical + // setups. You should use nnet3-latgen-faster-batch if you want to use a GPU. + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::Fst; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Generate lattices using nnet3 neural net model, with on-the-fly composition HCLG o B.\n" + "B is utterance-specific boosting graph, typically a single-state FST with\n" + "all words from words.txt on self loop arcs (then composition is not prohibitevly slow).\n" + "Some word-arcs will have score discounts as costs, to boost them in HMM beam-search.\n" + "Or, by not including words in B, we can remove them from HCLG network.\n" + "Usage: nnet3-latgen-faster-compose [options] " + " [ [] ]\n" + "See also: nnet3-latgen-faster-parallel, nnet3-latgen-faster-batch\n"; + + ParseOptions po(usage); + + Timer timer, timer_compose; + double elapsed_compose = 0.0; + + bool allow_partial = false; + LatticeFasterDecoderConfig config; + NnetSimpleComputationOptions decodable_opts; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + config.Register(&po); + decodable_opts.Register(&po); + po.Register("word-symbol-table", &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, + "If true, produce output even if end state was not reached."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for " + "utt2spk option used to get ivectors per speaker"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. 
If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + + po.Read(argc, argv); + + if (po.NumArgs() < 4 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + hclg_fst_rxfilename = po.GetArg(2), + boosting_fst_rspecifier = po.GetArg(3), + feature_rspecifier = po.GetArg(4), + lattice_wspecifier = po.GetArg(5), + words_wspecifier = po.GetOptArg(6), + alignment_wspecifier = po.GetOptArg(7); + + TransitionModel trans_model; + AmNnetSimple am_nnet; + { + bool binary; + Input ki(model_in_filename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + CollapseModel(CollapseModelConfig(), &(am_nnet.GetNnet())); + } + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + Int32VectorWriter words_writer(words_wspecifier); + Int32VectorWriter alignment_writer(alignment_wspecifier); + + std::unique_ptr word_syms = nullptr; + if (word_syms_filename != "") { + word_syms.reset(fst::SymbolTable::ReadText(word_syms_filename)); + if (!word_syms) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + } + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + // this compiler object allows caching of computations across + // different utterances. + CachingOptimizingCompiler compiler(am_nnet.GetNnet(), + decodable_opts.optimize_config); + + KALDI_ASSERT(ClassifyRspecifier(hclg_fst_rxfilename, NULL, NULL) == kNoRspecifier); + { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + RandomAccessTableReader boosting_fst_reader(boosting_fst_rspecifier); + + // HCLG FST is just one FST, not a table of FSTs. 
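+      // It is read once and kept in memory for the whole run; only the
+      // boosting graphs are fetched per utterance from the rspecifier table.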
+ auto hclg_fst = std::unique_ptr>(fst::ReadFstKaldi(hclg_fst_rxfilename)); + + // make sure hclg is sorted on olabel + if (hclg_fst->Properties(fst::kOLabelSorted, true) == 0) { + fst::OLabelCompare olabel_comp; + fst::ArcSort(hclg_fst.get(), olabel_comp); + } + + timer.Reset(); + + { + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + // get the boosting graph, + VectorFst boosting_fst; + if (!boosting_fst_reader.HasKey(utt)) { + KALDI_WARN << "No boosting fst for utterance " << utt; + num_fail++; + continue; + } else { + boosting_fst = boosting_fst_reader.Value(utt); // copy, + } + + timer_compose.Reset(); + + // make sure boosting graph is sorted on ilabel, + if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&boosting_fst, ilabel_comp); + } + + // TODO: should we call rmepsilon on boosting_fst ? + + // run composition (measure time), + VectorFst decode_fst; + fst::Compose(*hclg_fst, boosting_fst, &decode_fst); + + // TODO: should we sort the 'decode_fst' by isymbols ? + // (we don't do it, as it would take time. 
+ // not sure it decoding would be faster if + // decode_fst was sorted by isymbols) + + elapsed_compose += timer_compose.Elapsed(); + + DecodableAmNnetSimple nnet_decodable( + decodable_opts, trans_model, am_nnet, + features, ivector, online_ivectors, + online_ivector_period, &compiler); + + LatticeFasterDecoder decoder(decode_fst, config); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms.get(), utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += nnet_decodable.NumFramesReady(); + num_success++; + } else num_fail++; + } + } + } + + kaldi::int64 input_frame_count = + frame_count * decodable_opts.frame_subsampling_factor; + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed * 100.0 / input_frame_count); + KALDI_LOG << "Composition time "<< elapsed_compose + << "s (" << (elapsed_compose * 100.0 / elapsed) << "%)"; + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " + << (tot_like / frame_count) << " over " + << frame_count << " frames."; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From d87ec232edb0faccdb437108e7f984affc750484 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 4 Jan 2021 16:21:49 +0100 Subject: [PATCH 3/6] adding script for on-the-fly boosting of HCLG graph --- egs/wsj/s5/steps/nnet3/decode_compose.sh | 181 ++++++++++++++++++++ src/nnet3bin/nnet3-latgen-faster-compose.cc | 8 + 2 files changed, 189 insertions(+) create mode 100755 egs/wsj/s5/steps/nnet3/decode_compose.sh diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh new file mode 100755 index 00000000000..9f4e8bef020 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. 
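+#
+# Note: 'boosting_graphs' must be set to an rspecifier holding one boosting
+# FST per utterance key, e.g. "ark:exp/boost/utt_graphs.fsts" (a hypothetical
+# path). Each graph is typically a single-state FST with all words from
+# words.txt on self-loop arcs, where boosted words carry a negative
+# (discount) cost; see the usage message of nnet3-latgen-faster-compose.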
+ +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index ad9f8d38d21..e560a8a15af 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -225,6 +225,14 @@ int main(int argc, char *argv[]) { // not sure it decoding would be faster if // decode_fst was sorted by isymbols) + // Check that composed graph is non-empty, + if (decode_fst.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " + << utt << " (bad boosting graph?)"; + num_fail++; + continue; + } + elapsed_compose += timer_compose.Elapsed(); DecodableAmNnetSimple nnet_decodable( From f9ae938499190fc6f9e3ea0f454bce455969b0d1 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Tue, 5 Jan 2021 19:00:37 +0100 Subject: [PATCH 4/6] updating the HCLG boosting code, debugging --- egs/wsj/s5/steps/nnet3/decode_compose.sh | 4 +- src/nnet3bin/nnet3-latgen-faster-compose.cc | 172 +++++++++++--------- 2 files changed, 95 insertions(+), 81 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh index 9f4e8bef020..8d003b60eac 100755 --- a/egs/wsj/s5/steps/nnet3/decode_compose.sh +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -22,8 +22,8 @@ min_active=200 ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final -num_threads=1 # if >1, will use gmm-latgen-faster-parallel -use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. 
# In that case it is recommended to set num-threads to a large # number, e.g. 20 if you have that many free CPU slots on a GPU # node, and to use a small number of jobs. diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index e560a8a15af..ad230540584 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -31,6 +31,7 @@ #include "base/timer.h" #include +#include #include @@ -154,106 +155,119 @@ int main(int argc, char *argv[]) { RandomAccessTableReader boosting_fst_reader(boosting_fst_rspecifier); - // HCLG FST is just one FST, not a table of FSTs. - auto hclg_fst = std::unique_ptr>(fst::ReadFstKaldi(hclg_fst_rxfilename)); + // 'hclg_fst' is a single FST. + VectorFst hclg_fst; + { + auto hclg_fst_tmp = std::unique_ptr>(fst::ReadFstKaldiGeneric(hclg_fst_rxfilename)); + hclg_fst = VectorFst(*hclg_fst_tmp); // Fst -> VectorFst, as it has to be MutableFst... + // 'hclg_fst_tmp' is deleted by 'going out of scope' ... + } // make sure hclg is sorted on olabel - if (hclg_fst->Properties(fst::kOLabelSorted, true) == 0) { + if (hclg_fst.Properties(fst::kOLabelSorted, true) == 0) { fst::OLabelCompare olabel_comp; - fst::ArcSort(hclg_fst.get(), olabel_comp); + fst::ArcSort(&hclg_fst, olabel_comp); } timer.Reset(); - { - - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - const Matrix &features (feature_reader.Value()); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; + //// MAIN LOOP //// + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; num_fail++; continue; + } else { + ivector = &ivector_reader.Value(utt); } - const Matrix *online_ivectors = NULL; - const Vector *ivector = NULL; - if (!ivector_rspecifier.empty()) { - if (!ivector_reader.HasKey(utt)) { - KALDI_WARN << "No iVector available for utterance " << utt; - num_fail++; - continue; - } else { - ivector = &ivector_reader.Value(utt); - } - } - if (!online_ivector_rspecifier.empty()) { - if (!online_ivector_reader.HasKey(utt)) { - KALDI_WARN << "No online iVector available for utterance " << utt; - num_fail++; - continue; - } else { - online_ivectors = &online_ivector_reader.Value(utt); - } - } - - // get the boosting graph, - VectorFst boosting_fst; - if (!boosting_fst_reader.HasKey(utt)) { - KALDI_WARN << "No boosting fst for utterance " << utt; + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; num_fail++; continue; } else { - boosting_fst = boosting_fst_reader.Value(utt); // copy, + online_ivectors = &online_ivector_reader.Value(utt); } + } - timer_compose.Reset(); - - // make sure boosting graph is sorted on ilabel, - if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { - fst::ILabelCompare ilabel_comp; - fst::ArcSort(&boosting_fst, ilabel_comp); - } + // get the boosting graph, + VectorFst boosting_fst; + if (!boosting_fst_reader.HasKey(utt)) { + KALDI_WARN << "No boosting fst for utterance " << utt; + num_fail++; + 
continue; + } else { + boosting_fst = boosting_fst_reader.Value(utt); // copy, + } - // TODO: should we call rmepsilon on boosting_fst ? + timer_compose.Reset(); - // run composition (measure time), - VectorFst decode_fst; - fst::Compose(*hclg_fst, boosting_fst, &decode_fst); + // RmEpsilon saved 30% of composition runtime... + // - Note: we are loading 2-state graphs with eps back-link to the initial state. + if (boosting_fst.Properties(fst::kIEpsilons, true) != 0) { + fst::RmEpsilon(&boosting_fst); + } - // TODO: should we sort the 'decode_fst' by isymbols ? - // (we don't do it, as it would take time. - // not sure it decoding would be faster if - // decode_fst was sorted by isymbols) + // make sure boosting graph is sorted on ilabel, + if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { + fst::ILabelCompare ilabel_comp; + fst::ArcSort(&boosting_fst, ilabel_comp); + } - // Check that composed graph is non-empty, - if (decode_fst.Start() == fst::kNoStateId) { - KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " - << utt << " (bad boosting graph?)"; - num_fail++; - continue; - } + // run composition, + VectorFst decode_fst; + fst::Compose(hclg_fst, boosting_fst, &decode_fst); - elapsed_compose += timer_compose.Elapsed(); - - DecodableAmNnetSimple nnet_decodable( - decodable_opts, trans_model, am_nnet, - features, ivector, online_ivectors, - online_ivector_period, &compiler); - - LatticeFasterDecoder decoder(decode_fst, config); - - double like; - if (DecodeUtteranceLatticeFaster( - decoder, nnet_decodable, trans_model, word_syms.get(), utt, - decodable_opts.acoustic_scale, determinize, allow_partial, - &alignment_writer, &words_writer, &compact_lattice_writer, - &lattice_writer, - &like)) { - tot_like += like; - frame_count += nnet_decodable.NumFramesReady(); - num_success++; - } else num_fail++; + // check that composed graph is non-empty, + if (decode_fst.Start() == fst::kNoStateId) { + KALDI_WARN << "Empty 'decode_fst' HCLG for utterance " + << utt << " (bad boosting graph?)"; + num_fail++; + continue; } + + elapsed_compose += timer_compose.Elapsed(); + + DecodableAmNnetSimple nnet_decodable( + decodable_opts, trans_model, am_nnet, + features, ivector, online_ivectors, + online_ivector_period, &compiler); + + // Note: decode_fst is VectorFst, not ConstFst. + // + // OpenFst docs say that more specific iterators + // are faster than generic iterators. And in HCLG + // is usually loaded for decoding as ConstFst. + // + // auto decode_fst_ = ConstFst(decode_fst); + // + // In this way, I tried to cast VectorFst to ConstFst, + // but this made the decoding 20% slower. + // + LatticeFasterDecoder decoder(decode_fst, config); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms.get(), utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += nnet_decodable.NumFramesReady(); + num_success++; + } else num_fail++; } } From 0e31ffa77782a8f704bf162f43a917955b13d26c Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 21 Jun 2021 16:20:55 +0200 Subject: [PATCH 5/6] Apply suggestions from code review Adding suggestions from code review. 
Co-authored-by: Cy 'kkm' Katsnelson --- egs/wsj/s5/steps/nnet3/decode_compose.sh | 4 +-- src/latbin/lattice-compose-fsts.cc | 28 ++++++++++------- src/nnet3bin/nnet3-latgen-faster-compose.cc | 35 +++++++++++---------- 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/decode_compose.sh b/egs/wsj/s5/steps/nnet3/decode_compose.sh index 8d003b60eac..5e3b80adf09 100755 --- a/egs/wsj/s5/steps/nnet3/decode_compose.sh +++ b/egs/wsj/s5/steps/nnet3/decode_compose.sh @@ -48,7 +48,7 @@ echo "$0 $@" # Print the command line for logging if [ $# -ne 3 ]; then echo "Usage: $0 [options] " echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" - echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" echo "main options (for others, see top of script file)" echo " --config # config containing options" @@ -67,7 +67,7 @@ fi graphdir=$1 data=$2 dir=$3 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. model=$srcdir/$iter.mdl [ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 diff --git a/src/latbin/lattice-compose-fsts.cc b/src/latbin/lattice-compose-fsts.cc index bbdcc670f01..8fd889e192f 100644 --- a/src/latbin/lattice-compose-fsts.cc +++ b/src/latbin/lattice-compose-fsts.cc @@ -47,12 +47,14 @@ int main(int argc, char *argv[]) { bool write_compact = true; int32 num_states_cache = 50000; - int32 phi_label = fst::kNoLabel; // == -1 - po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); + int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form."); + po.Register("phi-label", &phi_label, + "If >0, the label on backoff arcs of the LM"); po.Register("num-states-cache", &num_states_cache, - "Number of states we cache when mapping LM FST to lattice type. " - "More -> more memory but faster."); + "Number of states we cache when mapping LM FST to lattice type." + " More -> more memory but faster."); po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -72,10 +74,11 @@ int main(int argc, char *argv[]) { CompactLatticeWriter compact_lattice_writer; LatticeWriter lattice_writer; - if (write_compact) + if (write_compact) { compact_lattice_writer.Open(lats_wspecifier); - else + } else { lattice_writer.Open(lats_wspecifier); + } if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { std::string fst_rxfilename = arg2; @@ -105,8 +108,11 @@ int main(int argc, char *argv[]) { Lattice lat1 = lattice_reader1.Value(); ArcSort(&lat1, fst::OLabelCompare()); Lattice composed_lat; - if (phi_label > 0) PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); - else Compose(lat1, mapped_fst2, &composed_lat); + if (phi_label > 0) { + PhiCompose(lat1, mapped_fst2, phi_label, &composed_lat); + } else { + Compose(lat1, mapped_fst2, &composed_lat); + } if (composed_lat.Start() == fst::kNoStateId) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; @@ -123,7 +129,7 @@ int main(int argc, char *argv[]) { } delete fst2; } else { - // composing with each utterance with different fst, + // Compose each utterance with its matching (by key) FST. 
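+      // (Utterances with no matching key in the FST table are skipped with
+      // a warning and counted in n_fail.)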
std::string fst_rspecifier2 = arg2; RandomAccessTableReader fst_reader2(fst_rspecifier2); @@ -135,7 +141,7 @@ int main(int argc, char *argv[]) { if (!fst_reader2.HasKey(key)) { KALDI_WARN << "Not producing output for utterance " << key - << " because not present in second table."; + << " because it's not present in second table."; n_fail++; continue; } diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index ad230540584..7d9a5081dc5 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -20,19 +20,20 @@ // limitations under the License. +#include +#include + +#include + #include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "tree/context-dep.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" +#include "base/timer.h" #include "decoder/decoder-wrappers.h" +#include "fstext/fstext-lib.h" +#include "hmm/transition-model.h" #include "nnet3/nnet-am-decodable-simple.h" #include "nnet3/nnet-utils.h" -#include "base/timer.h" - -#include -#include -#include +#include "tree/context-dep.h" +#include "util/common-utils.h" int main(int argc, char *argv[]) { @@ -121,9 +122,10 @@ int main(int argc, char *argv[]) { CompactLatticeWriter compact_lattice_writer; LatticeWriter lattice_writer; if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) - : lattice_writer.Open(lattice_wspecifier))) + : lattice_writer.Open(lattice_wspecifier))) { KALDI_ERR << "Could not open table for writing lattices: " - << lattice_wspecifier; + << lattice_wspecifier; + } RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); @@ -213,13 +215,13 @@ int main(int argc, char *argv[]) { timer_compose.Reset(); - // RmEpsilon saved 30% of composition runtime... + // RmEpsilon saved 30% of composition runtime. // - Note: we are loading 2-state graphs with eps back-link to the initial state. if (boosting_fst.Properties(fst::kIEpsilons, true) != 0) { fst::RmEpsilon(&boosting_fst); } - // make sure boosting graph is sorted on ilabel, + // Make sure boosting graph is sorted on ilabel. if (boosting_fst.Properties(fst::kILabelSorted, true) == 0) { fst::ILabelCompare ilabel_comp; fst::ArcSort(&boosting_fst, ilabel_comp); @@ -267,7 +269,9 @@ int main(int argc, char *argv[]) { tot_like += like; frame_count += nnet_decodable.NumFramesReady(); num_success++; - } else num_fail++; + } else { + ++num_fail; + } } } @@ -286,8 +290,7 @@ int main(int argc, char *argv[]) { << (tot_like / frame_count) << " over " << frame_count << " frames."; - if (num_success != 0) return 0; - else return 1; + return num_success != 0 ? 
0 : 1; } catch(const std::exception &e) { std::cerr << e.what(); return -1; From 634a7159aa7e8e5e0815f1c34e1b1954b9b5aabd Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Mon, 31 Jan 2022 18:22:04 +0100 Subject: [PATCH 6/6] nnet3-latgen-faster-compose, add RhoCompose and nnet3/decode_compose_rho.sh script --- egs/wsj/s5/steps/nnet3/decode_compose_rho.sh | 184 +++++++++++++++++++ src/configure_mkl | 21 +++ src/nnet3bin/nnet3-latgen-faster-compose.cc | 14 +- 3 files changed, 217 insertions(+), 2 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/decode_compose_rho.sh create mode 100755 src/configure_mkl diff --git a/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh new file mode 100755 index 00000000000..362bec260ca --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_compose_rho.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +# Copyright 2021 Brno University of Technology (Author: Karel Vesely). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. +# It calls 'nnet3-latgen-faster-compose', which does on-the-fly boosting +# of HCLG graph by composing it with per-utterance boosting graphs (pre-existing). + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +#num_threads=1 # if >1, will use gmm-latgen-faster-parallel +#use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false + +boosting_graphs= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=$(dirname $dir) # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + +[ -z "$boosting_graphs" ] && echo "Error, \$boosting_graphs have to be set !" && exit 1 + +extra_files= +if [ ! 
-z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +if [ -f $srcdir/cmvn_opts ]; then + cmvn_opts=`cat $srcdir/cmvn_opts` +else + cmvn_opts="--norm-means=false --norm-vars=false" +fi + +#thread_string= +#if $use_gpu; then +# if [ $num_threads -eq 1 ]; then +# echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." +# fi +# thread_string="-batch --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads --gpu 1" +#elif [ $num_threads -gt 1 ]; then +# thread_string="-parallel --num-threads=$num_threads" +# queue_opt="--num-threads $num_threads" +#fi +queue_opt="--num-threads 1" # 1 thread, we do on-the-fly boosting, the binary has no multi-threading... + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +## Set up features. +if [ -f $srcdir/online_cmvn ]; then online_cmvn=true +else online_cmvn=false; fi + +if ! $online_cmvn; then + echo "$0: feature type is raw" + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +else + echo "$0: feature type is raw (apply-cmvn-online)" + feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +elif [ -f $srcdir/init/info.txt ]; then + frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt) + if [ ! -z $frame_subsampling_factor ]; then + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + fi +fi + +rho_label=$(grep '#0' $graphdir/words.txt | awk '{ print $2; }') + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-compose $ivector_opts $frame_subsampling_opt \ + --rho-label=$rho_label \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$boosting_graphs" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! 
-z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/src/configure_mkl b/src/configure_mkl new file mode 100755 index 00000000000..0f2936471f7 --- /dev/null +++ b/src/configure_mkl @@ -0,0 +1,21 @@ +#!/bin/bash + +# Get the MKL config, +#source /usr/local/share/intel/mkl/bin/mklvars.sh intel64 ilp64 # outdated... +export MKLROOT=/usr/local/share/intel/mkl/2021.4.0 + +# Use older compiler +# export CXX=g++-7.4 # CUDA 10.0 works well with version '7.4', +export CXX=g++-9.4 # CUDA 11.2, recommended gcc version 9.*, + +export CXXFLAGS="-march=x86-64" # compile for 'generic' 64bit CPU, +#export CXXFLAGS="-march=westmere" # oldest architecutre we have at BUT (X5675, Westmere, blade024), + +# Use different CUDA, +# CUDATK=/usr/local/share/cuda-10.2.89 # CUDA 10.0 supports our default gcc 7.4.0, +# CUDATK=/usr/local/share/cuda-11.0.194 +CUDATK=/usr/local/share/cuda-11.2 +# and add '--cudatk-dir=$CUDATK' to './configure' + +# Generate kaldi.mk, +./configure --mkl-root=$MKLROOT --cudatk-dir=$CUDATK --shared diff --git a/src/nnet3bin/nnet3-latgen-faster-compose.cc b/src/nnet3bin/nnet3-latgen-faster-compose.cc index 7d9a5081dc5..d4e7c094d30 100644 --- a/src/nnet3bin/nnet3-latgen-faster-compose.cc +++ b/src/nnet3bin/nnet3-latgen-faster-compose.cc @@ -64,6 +64,8 @@ int main(int argc, char *argv[]) { Timer timer, timer_compose; double elapsed_compose = 0.0; + int32 rho_label = fst::kNoLabel; // == -1 + bool allow_partial = false; LatticeFasterDecoderConfig config; NnetSimpleComputationOptions decodable_opts; @@ -75,6 +77,10 @@ int main(int argc, char *argv[]) { int32 online_ivector_period = 0; config.Register(&po); decodable_opts.Register(&po); + + po.Register("rho-label", &rho_label, + "If >0, symbol for 'match the rest' in the biasing graph boosting_fst"); + po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]"); po.Register("allow-partial", &allow_partial, @@ -125,7 +131,7 @@ int main(int argc, char *argv[]) { : lattice_writer.Open(lattice_wspecifier))) { KALDI_ERR << "Could not open table for writing lattices: " << lattice_wspecifier; - } + } RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); @@ -229,7 +235,11 @@ int main(int argc, char *argv[]) { // run composition, VectorFst decode_fst; - fst::Compose(hclg_fst, boosting_fst, &decode_fst); + if (rho_label > 0) { + fst::RhoCompose(hclg_fst, boosting_fst, rho_label, &decode_fst); + } else { + fst::Compose(hclg_fst, boosting_fst, &decode_fst); + } // check that composed graph is non-empty, if (decode_fst.Start() == fst::kNoStateId) {